1 #include <math.h>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5 #include <stdarg.h>
6 #include <locale.h>
7 #include "linear.h"
8 #include "newton.h"
9 int liblinear_version = LIBLINEAR_VERSION;
10 typedef signed char schar;
11 template <class T> static inline void swap(T& x, T& y) { T t=x; x=y; y=t; }
12 #ifndef min
13 template <class T> static inline T min(T x,T y) { return (x<y)?x:y; }
14 #endif
15 #ifndef max
16 template <class T> static inline T max(T x,T y) { return (x>y)?x:y; }
17 #endif
18 template <class S, class T> static inline void clone(T*& dst, S* src, int n)
19 {
20         dst = new T[n];
21         memcpy((void *)dst,(void *)src,sizeof(T)*n);
22 }
23 #define INF HUGE_VAL
24 #define Malloc(type,n) (type *)malloc((n)*sizeof(type))
25
26 static void print_string_stdout(const char *s)
27 {
28         fputs(s,stdout);
29         fflush(stdout);
30 }
31 static void print_null(const char *s) {}
32
33 static void (*liblinear_print_string) (const char *) = &print_string_stdout;
34
35 #if 1
36 static void info(const char *fmt,...)
37 {
38         char buf[BUFSIZ];
39         va_list ap;
40         va_start(ap,fmt);
41         vsprintf(buf,fmt,ap);
42         va_end(ap);
43         (*liblinear_print_string)(buf);
44 }
45 #else
46 static void info(const char *fmt,...) {}
47 #endif
48 class sparse_operator
49 {
50 public:
51         static double nrm2_sq(const feature_node *x)
52         {
53                 double ret = 0;
54                 while(x->index != -1)
55                 {
56                         ret += x->value*x->value;
57                         x++;
58                 }
59                 return (ret);
60         }
61
62         static double dot(const double *s, const feature_node *x)
63         {
64                 double ret = 0;
65                 while(x->index != -1)
66                 {
67                         ret += s[x->index-1]*x->value;
68                         x++;
69                 }
70                 return (ret);
71         }
72
73         static double sparse_dot(const feature_node *x1, const feature_node *x2)
74         {
75                 double ret = 0;
76                 while(x1->index != -1 && x2->index != -1)
77                 {
78                         if(x1->index == x2->index)
79                         {
80                                 ret += x1->value * x2->value;
81                                 ++x1;
82                                 ++x2;
83                         }
84                         else
85                         {
86                                 if(x1->index > x2->index)
87                                         ++x2;
88                                 else
89                                         ++x1;
90                         }
91                 }
92                 return (ret);
93         }
94
95         static void axpy(const double a, const feature_node *x, double *y)
96         {
97                 while(x->index != -1)
98                 {
99                         y[x->index-1] += a*x->value;
100                         x++;
101                 }
102         }
103 };
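
// Illustrative sketch (not part of the library): instances are stored as arrays of
// feature_node terminated by index == -1, and the helpers above walk that sparse
// format. Assuming such an array x and a dense vector v, one might write:
//
//   feature_node x[] = {{1, 0.5}, {3, -2.0}, {-1, 0.0}}; // sparse vector 0.5*e1 - 2.0*e3
//   double v[3] = {1.0, 1.0, 1.0};
//   double sq  = sparse_operator::nrm2_sq(x);   // 0.5*0.5 + (-2.0)*(-2.0) = 4.25
//   double dot = sparse_operator::dot(v, x);    // v[0]*0.5 + v[2]*(-2.0)  = -1.5
//   sparse_operator::axpy(2.0, x, v);           // v += 2.0*x  ->  {2.0, 1.0, -3.0}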
104
105 // L2-regularized empirical risk minimization
106 // min_w w^Tw/2 + \sum C_i \xi(w^Tx_i), where \xi() is the loss
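//
// The derived classes below supply \xi() through C_times_loss():
//   l2r_lr_fun:      C_i * log(1 + exp(-y_i w^T x_i))        (logistic loss)
//   l2r_l2_svc_fun:  C_i * max(0, 1 - y_i w^T x_i)^2         (squared hinge loss)
//   l2r_l2_svr_fun:  C_i * max(0, |w^T x_i - y_i| - p)^2     (epsilon-insensitive squared loss)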
107
108 class l2r_erm_fun: public function
109 {
110 public:
111         l2r_erm_fun(const problem *prob, const parameter *param, double *C);
112         ~l2r_erm_fun();
113
114         double fun(double *w);
115         double linesearch_and_update(double *w, double *d, double *f, double *g, double alpha);
116         int get_nr_variable(void);
117
118 protected:
119         virtual double C_times_loss(int i, double wx_i) = 0;
120         void Xv(double *v, double *Xv);
121         void XTv(double *v, double *XTv);
122
123         double *C;
124         const problem *prob;
125         double *wx;
126         double *tmp; // a working array
127         double wTw;
128         int regularize_bias;
129 };
130
131 l2r_erm_fun::l2r_erm_fun(const problem *prob, const parameter *param, double *C)
132 {
133         int l=prob->l;
134
135         this->prob = prob;
136
137         wx = new double[l];
138         tmp = new double[l];
139         this->C = C;
140         this->regularize_bias = param->regularize_bias;
141 }
142
143 l2r_erm_fun::~l2r_erm_fun()
144 {
145         delete[] wx;
146         delete[] tmp;
147 }
148
149 double l2r_erm_fun::fun(double *w)
150 {
151         int i;
152         double f=0;
153         int l=prob->l;
154         int w_size=get_nr_variable();
155
156         wTw = 0;
157         Xv(w, wx);
158
159         for(i=0;i<w_size;i++)
160                 wTw += w[i]*w[i];
161         if(regularize_bias == 0)
162                 wTw -= w[w_size-1]*w[w_size-1];
163         for(i=0;i<l;i++)
164                 f += C_times_loss(i, wx[i]);
165         f = f + 0.5 * wTw;
166
167         return(f);
168 }
169
170 int l2r_erm_fun::get_nr_variable(void)
171 {
172         return prob->n;
173 }
174
175 // On entry *f must be the function value of w
176 // On exit w is updated and *f is the new function value
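//
// The step is found by simple backtracking: starting from the given alpha, alpha is
// halved (at most max_num_linesearch times) until the sufficient-decrease condition
//   f(w + alpha*s) - f(w) <= eta * alpha * g^T s,   with eta = 0.01,
// holds. On success the accepted alpha is returned; on failure 0 is returned and
// w and *f are left unchanged.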
177 double l2r_erm_fun::linesearch_and_update(double *w, double *s, double *f, double *g, double alpha)
178 {
179         int i;
180         int l = prob->l;
181         double sTs = 0;
182         double wTs = 0;
183         double gTs = 0;
184         double eta = 0.01;
185         int w_size = get_nr_variable();
186         int max_num_linesearch = 20;
187         double fold = *f;
188         Xv(s, tmp);
189
190         for (i=0;i<w_size;i++)
191         {
192                 sTs += s[i] * s[i];
193                 wTs += s[i] * w[i];
194                 gTs += s[i] * g[i];
195         }
196         if(regularize_bias == 0)
197         {
198                 // bias not used in calculating (w + \alpha s)^T (w + \alpha s)
199                 sTs -= s[w_size-1] * s[w_size-1];
200                 wTs -= s[w_size-1] * w[w_size-1];
201         }
202
203         int num_linesearch = 0;
204         for(num_linesearch=0; num_linesearch < max_num_linesearch; num_linesearch++)
205         {
206                 double loss = 0;
207                 for(i=0;i<l;i++)
208                 {
209                         double inner_product = tmp[i] * alpha + wx[i];
210                         loss += C_times_loss(i, inner_product);
211                 }
212                 *f = loss + (alpha * alpha * sTs + wTw) / 2.0 + alpha * wTs;
213                 if (*f - fold <= eta * alpha * gTs)
214                 {
215                         for (i=0;i<l;i++)
216                                 wx[i] += alpha * tmp[i];
217                         break;
218                 }
219                 else
220                         alpha *= 0.5;
221         }
222
223         if (num_linesearch >= max_num_linesearch)
224         {
225                 *f = fold;
226                 return 0;
227         }
228         else
229                 for (i=0;i<w_size;i++)
230                         w[i] += alpha * s[i];
231
232         wTw += alpha * alpha * sTs + 2* alpha * wTs;
233         return alpha;
234 }
235
236 void l2r_erm_fun::Xv(double *v, double *Xv)
237 {
238         int i;
239         int l=prob->l;
240         feature_node **x=prob->x;
241
242         for(i=0;i<l;i++)
243                 Xv[i]=sparse_operator::dot(v, x[i]);
244 }
245
246 void l2r_erm_fun::XTv(double *v, double *XTv)
247 {
248         int i;
249         int l=prob->l;
250         int w_size=get_nr_variable();
251         feature_node **x=prob->x;
252
253         for(i=0;i<w_size;i++)
254                 XTv[i]=0;
255         for(i=0;i<l;i++)
256                 sparse_operator::axpy(v[i], x[i], XTv);
257 }
258
259 class l2r_lr_fun: public l2r_erm_fun
260 {
261 public:
262         l2r_lr_fun(const problem *prob, const parameter *param, double *C);
263         ~l2r_lr_fun();
264
265         void grad(double *w, double *g);
266         void Hv(double *s, double *Hs);
267
268         void get_diag_preconditioner(double *M);
269
270 private:
271         double *D;
272         double C_times_loss(int i, double wx_i);
273 };
274
275 l2r_lr_fun::l2r_lr_fun(const problem *prob, const parameter *param, double *C):
276         l2r_erm_fun(prob, param, C)
277 {
278         int l=prob->l;
279         D = new double[l];
280 }
281
282 l2r_lr_fun::~l2r_lr_fun()
283 {
284         delete[] D;
285 }
286
287 double l2r_lr_fun::C_times_loss(int i, double wx_i)
288 {
289         double ywx_i = wx_i * prob->y[i];
290         if (ywx_i >= 0)
291                 return C[i]*log(1 + exp(-ywx_i));
292         else
293                 return C[i]*(-ywx_i + log(1 + exp(ywx_i)));
294 }
295
296 void l2r_lr_fun::grad(double *w, double *g)
297 {
298         int i;
299         double *y=prob->y;
300         int l=prob->l;
301         int w_size=get_nr_variable();
302
303         for(i=0;i<l;i++)
304         {
305                 tmp[i] = 1/(1 + exp(-y[i]*wx[i]));
306                 D[i] = tmp[i]*(1-tmp[i]);
307                 tmp[i] = C[i]*(tmp[i]-1)*y[i];
308         }
309         XTv(tmp, g);
310
311         for(i=0;i<w_size;i++)
312                 g[i] = w[i] + g[i];
313         if(regularize_bias == 0)
314                 g[w_size-1] -= w[w_size-1];
315 }
316
317 void l2r_lr_fun::get_diag_preconditioner(double *M)
318 {
319         int i;
320         int l = prob->l;
321         int w_size=get_nr_variable();
322         feature_node **x = prob->x;
323
324         for (i=0; i<w_size; i++)
325                 M[i] = 1;
326         if(regularize_bias == 0)
327                 M[w_size-1] = 0;
328
329         for (i=0; i<l; i++)
330         {
331                 feature_node *xi = x[i];
332                 while (xi->index!=-1)
333                 {
334                         M[xi->index-1] += xi->value*xi->value*C[i]*D[i];
335                         xi++;
336                 }
337         }
338 }
339
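// Hessian-vector product for L2-regularized logistic regression. With
//   D_ii = sigma(y_i w^T x_i) * (1 - sigma(y_i w^T x_i))
// computed in grad() above, the routine below evaluates
//   Hs = s + X^T (C .* D .* (X s))
// (the identity term comes from the w^T w / 2 regularizer; it is dropped for the
// bias entry when regularize_bias == 0) without ever forming the Hessian explicitly.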
340 void l2r_lr_fun::Hv(double *s, double *Hs)
341 {
342         int i;
343         int l=prob->l;
344         int w_size=get_nr_variable();
345         feature_node **x=prob->x;
346
347         for(i=0;i<w_size;i++)
348                 Hs[i] = 0;
349         for(i=0;i<l;i++)
350         {
351                 feature_node * const xi=x[i];
352                 double xTs = sparse_operator::dot(s, xi);
353
354                 xTs = C[i]*D[i]*xTs;
355
356                 sparse_operator::axpy(xTs, xi, Hs);
357         }
358         for(i=0;i<w_size;i++)
359                 Hs[i] = s[i] + Hs[i];
360         if(regularize_bias == 0)
361                 Hs[w_size-1] -= s[w_size-1];
362 }
363
364 class l2r_l2_svc_fun: public l2r_erm_fun
365 {
366 public:
367         l2r_l2_svc_fun(const problem *prob, const parameter *param, double *C);
368         ~l2r_l2_svc_fun();
369
370         void grad(double *w, double *g);
371         void Hv(double *s, double *Hs);
372
373         void get_diag_preconditioner(double *M);
374
375 protected:
376         void subXTv(double *v, double *XTv);
377
378         int *I;
379         int sizeI;
380
381 private:
382         double C_times_loss(int i, double wx_i);
383 };
384
385 l2r_l2_svc_fun::l2r_l2_svc_fun(const problem *prob, const parameter *param, double *C):
386         l2r_erm_fun(prob, param, C)
387 {
388         I = new int[prob->l];
389 }
390
391 l2r_l2_svc_fun::~l2r_l2_svc_fun()
392 {
393         delete[] I;
394 }
395
396 double l2r_l2_svc_fun::C_times_loss(int i, double wx_i)
397 {
398         double d = 1 - prob->y[i] * wx_i;
399         if (d > 0)
400                 return C[i] * d * d;
401         else
402                 return 0;
403 }
404
405 void l2r_l2_svc_fun::grad(double *w, double *g)
406 {
407         int i;
408         double *y=prob->y;
409         int l=prob->l;
410         int w_size=get_nr_variable();
411
412         sizeI = 0;
413         for (i=0;i<l;i++)
414         {
415                 tmp[i] = wx[i] * y[i];
416                 if (tmp[i] < 1)
417                 {
418                         tmp[sizeI] = C[i]*y[i]*(tmp[i]-1);
419                         I[sizeI] = i;
420                         sizeI++;
421                 }
422         }
423         subXTv(tmp, g);
424
425         for(i=0;i<w_size;i++)
426                 g[i] = w[i] + 2*g[i];
427         if(regularize_bias == 0)
428                 g[w_size-1] -= w[w_size-1];
429 }
430
431 void l2r_l2_svc_fun::get_diag_preconditioner(double *M)
432 {
433         int i;
434         int w_size=get_nr_variable();
435         feature_node **x = prob->x;
436
437         for (i=0; i<w_size; i++)
438                 M[i] = 1;
439         if(regularize_bias == 0)
440                 M[w_size-1] = 0;
441
442         for (i=0; i<sizeI; i++)
443         {
444                 int idx = I[i];
445                 feature_node *xi = x[idx];
446                 while (xi->index!=-1)
447                 {
448                         M[xi->index-1] += xi->value*xi->value*C[idx]*2;
449                         xi++;
450                 }
451         }
452 }
453
454 void l2r_l2_svc_fun::Hv(double *s, double *Hs)
455 {
456         int i;
457         int w_size=get_nr_variable();
458         feature_node **x=prob->x;
459
460         for(i=0;i<w_size;i++)
461                 Hs[i]=0;
462         for(i=0;i<sizeI;i++)
463         {
464                 feature_node * const xi=x[I[i]];
465                 double xTs = sparse_operator::dot(s, xi);
466
467                 xTs = C[I[i]]*xTs;
468
469                 sparse_operator::axpy(xTs, xi, Hs);
470         }
471         for(i=0;i<w_size;i++)
472                 Hs[i] = s[i] + 2*Hs[i];
473         if(regularize_bias == 0)
474                 Hs[w_size-1] -= s[w_size-1];
475 }
476
477 void l2r_l2_svc_fun::subXTv(double *v, double *XTv)
478 {
479         int i;
480         int w_size=get_nr_variable();
481         feature_node **x=prob->x;
482
483         for(i=0;i<w_size;i++)
484                 XTv[i]=0;
485         for(i=0;i<sizeI;i++)
486                 sparse_operator::axpy(v[i], x[I[i]], XTv);
487 }
488
489 class l2r_l2_svr_fun: public l2r_l2_svc_fun
490 {
491 public:
492         l2r_l2_svr_fun(const problem *prob, const parameter *param, double *C);
493
494         void grad(double *w, double *g);
495
496 private:
497         double C_times_loss(int i, double wx_i);
498         double p;
499 };
500
501 l2r_l2_svr_fun::l2r_l2_svr_fun(const problem *prob, const parameter *param, double *C):
502         l2r_l2_svc_fun(prob, param, C)
503 {
504         this->p = param->p;
505         this->regularize_bias = param->regularize_bias;
506 }
507
508 double l2r_l2_svr_fun::C_times_loss(int i, double wx_i)
509 {
510         double d = wx_i - prob->y[i];
511         if(d < -p)
512                 return C[i]*(d+p)*(d+p);
513         else if(d > p)
514                 return C[i]*(d-p)*(d-p);
515         return 0;
516 }
517
518 void l2r_l2_svr_fun::grad(double *w, double *g)
519 {
520         int i;
521         double *y=prob->y;
522         int l=prob->l;
523         int w_size=get_nr_variable();
524         double d;
525
526         sizeI = 0;
527         for(i=0;i<l;i++)
528         {
529                 d = wx[i] - y[i];
530
531                 // generate index set I
532                 if(d < -p)
533                 {
534                         tmp[sizeI] = C[i]*(d+p);
535                         I[sizeI] = i;
536                         sizeI++;
537                 }
538                 else if(d > p)
539                 {
540                         tmp[sizeI] = C[i]*(d-p);
541                         I[sizeI] = i;
542                         sizeI++;
543                 }
544
545         }
546         subXTv(tmp, g);
547
548         for(i=0;i<w_size;i++)
549                 g[i] = w[i] + 2*g[i];
550         if(regularize_bias == 0)
551                 g[w_size-1] -= w[w_size-1];
552 }
553
554 // A coordinate descent algorithm for
555 // multi-class support vector machines by Crammer and Singer
556 //
557 //  min_{\alpha}  0.5 \sum_m ||w_m(\alpha)||^2 + \sum_i \sum_m e^m_i alpha^m_i
558 //    s.t.     \alpha^m_i <= C^m_i \forall m,i , \sum_m \alpha^m_i=0 \forall i
559 //
560 //  where e^m_i = 0 if y_i  = m,
561 //        e^m_i = 1 if y_i != m,
562 //  C^m_i = C if m  = y_i,
563 //  C^m_i = 0 if m != y_i,
564 //  and w_m(\alpha) = \sum_i \alpha^m_i x_i
565 //
566 // Given:
567 // x, y, C
568 // eps is the stopping tolerance
569 //
570 // solution will be put in w
571 //
572 // See Appendix of LIBLINEAR paper, Fan et al. (2008)
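//
// Minimal usage sketch (illustrative only; prob, nr_class, weighted_C, and param
// are assumed to be prepared by the caller):
//
//   double *w = new double[prob->n * nr_class];
//   Solver_MCSVM_CS solver(prob, nr_class, weighted_C, param->eps);
//   solver.Solve(w);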
573
574 #define GETI(i) ((int) prob->y[i])
575 // To support instance weights, define GETI(i) as (i) instead
576
577 class Solver_MCSVM_CS
578 {
579         public:
580                 Solver_MCSVM_CS(const problem *prob, int nr_class, double *C, double eps=0.1, int max_iter=100000);
581                 ~Solver_MCSVM_CS();
582                 void Solve(double *w);
583         private:
584                 void solve_sub_problem(double A_i, int yi, double C_yi, int active_i, double *alpha_new);
585                 bool be_shrunk(int i, int m, int yi, double alpha_i, double minG);
586                 double *B, *C, *G;
587                 int w_size, l;
588                 int nr_class;
589                 int max_iter;
590                 double eps;
591                 const problem *prob;
592 };
593
594 Solver_MCSVM_CS::Solver_MCSVM_CS(const problem *prob, int nr_class, double *weighted_C, double eps, int max_iter)
595 {
596         this->w_size = prob->n;
597         this->l = prob->l;
598         this->nr_class = nr_class;
599         this->eps = eps;
600         this->max_iter = max_iter;
601         this->prob = prob;
602         this->B = new double[nr_class];
603         this->G = new double[nr_class];
604         this->C = weighted_C;
605 }
606
607 Solver_MCSVM_CS::~Solver_MCSVM_CS()
608 {
609         delete[] B;
610         delete[] G;
611 }
612
613 int compare_double(const void *a, const void *b)
614 {
615         if(*(double *)a > *(double *)b)
616                 return -1;
617         if(*(double *)a < *(double *)b)
618                 return 1;
619         return 0;
620 }
621
622 void Solver_MCSVM_CS::solve_sub_problem(double A_i, int yi, double C_yi, int active_i, double *alpha_new)
623 {
624         int r;
625         double *D;
626
627         clone(D, B, active_i);
628         if(yi < active_i)
629                 D[yi] += A_i*C_yi;
630         qsort(D, active_i, sizeof(double), compare_double);
631
632         double beta = D[0] - A_i*C_yi;
633         for(r=1;r<active_i && beta<r*D[r];r++)
634                 beta += D[r];
635         beta /= r;
636
637         for(r=0;r<active_i;r++)
638         {
639                 if(r == yi)
640                         alpha_new[r] = min(C_yi, (beta-B[r])/A_i);
641                 else
642                         alpha_new[r] = min((double)0, (beta - B[r])/A_i);
643         }
644         delete[] D;
645 }
646
647 bool Solver_MCSVM_CS::be_shrunk(int i, int m, int yi, double alpha_i, double minG)
648 {
649         double bound = 0;
650         if(m == yi)
651                 bound = C[GETI(i)];
652         if(alpha_i == bound && G[m] < minG)
653                 return true;
654         return false;
655 }
656
657 void Solver_MCSVM_CS::Solve(double *w)
658 {
659         int i, m, s;
660         int iter = 0;
661         double *alpha =  new double[l*nr_class];
662         double *alpha_new = new double[nr_class];
663         int *index = new int[l];
664         double *QD = new double[l];
665         int *d_ind = new int[nr_class];
666         double *d_val = new double[nr_class];
667         int *alpha_index = new int[nr_class*l];
668         int *y_index = new int[l];
669         int active_size = l;
670         int *active_size_i = new int[l];
671         double eps_shrink = max(10.0*eps, 1.0); // stopping tolerance for shrinking
672         bool start_from_all = true;
673
674         // Initial alpha can be set here. Note that
675         // sum_m alpha[i*nr_class+m] = 0, for all i=1,...,l-1
676         // alpha[i*nr_class+m] <= C[GETI(i)] if prob->y[i] == m
677         // alpha[i*nr_class+m] <= 0 if prob->y[i] != m
678         // If initial alpha isn't zero, uncomment the for loop below to initialize w
679         for(i=0;i<l*nr_class;i++)
680                 alpha[i] = 0;
681
682         for(i=0;i<w_size*nr_class;i++)
683                 w[i] = 0;
684         for(i=0;i<l;i++)
685         {
686                 for(m=0;m<nr_class;m++)
687                         alpha_index[i*nr_class+m] = m;
688                 feature_node *xi = prob->x[i];
689                 QD[i] = 0;
690                 while(xi->index != -1)
691                 {
692                         double val = xi->value;
693                         QD[i] += val*val;
694
695                         // Uncomment the for loop if initial alpha isn't zero
696                         // for(m=0; m<nr_class; m++)
697                         //      w[(xi->index-1)*nr_class+m] += alpha[i*nr_class+m]*val;
698                         xi++;
699                 }
700                 active_size_i[i] = nr_class;
701                 y_index[i] = (int)prob->y[i];
702                 index[i] = i;
703         }
704
705         while(iter < max_iter)
706         {
707                 double stopping = -INF;
708                 for(i=0;i<active_size;i++)
709                 {
710                         int j = i+rand()%(active_size-i);
711                         swap(index[i], index[j]);
712                 }
713                 for(s=0;s<active_size;s++)
714                 {
715                         i = index[s];
716                         double Ai = QD[i];
717                         double *alpha_i = &alpha[i*nr_class];
718                         int *alpha_index_i = &alpha_index[i*nr_class];
719
720                         if(Ai > 0)
721                         {
722                                 for(m=0;m<active_size_i[i];m++)
723                                         G[m] = 1;
724                                 if(y_index[i] < active_size_i[i])
725                                         G[y_index[i]] = 0;
726
727                                 feature_node *xi = prob->x[i];
728                                 while(xi->index!= -1)
729                                 {
730                                         double *w_i = &w[(xi->index-1)*nr_class];
731                                         for(m=0;m<active_size_i[i];m++)
732                                                 G[m] += w_i[alpha_index_i[m]]*(xi->value);
733                                         xi++;
734                                 }
735
736                                 double minG = INF;
737                                 double maxG = -INF;
738                                 for(m=0;m<active_size_i[i];m++)
739                                 {
740                                         if(alpha_i[alpha_index_i[m]] < 0 && G[m] < minG)
741                                                 minG = G[m];
742                                         if(G[m] > maxG)
743                                                 maxG = G[m];
744                                 }
745                                 if(y_index[i] < active_size_i[i])
746                                         if(alpha_i[(int) prob->y[i]] < C[GETI(i)] && G[y_index[i]] < minG)
747                                                 minG = G[y_index[i]];
748
749                                 for(m=0;m<active_size_i[i];m++)
750                                 {
751                                         if(be_shrunk(i, m, y_index[i], alpha_i[alpha_index_i[m]], minG))
752                                         {
753                                                 active_size_i[i]--;
754                                                 while(active_size_i[i]>m)
755                                                 {
756                                                         if(!be_shrunk(i, active_size_i[i], y_index[i],
757                                                                                         alpha_i[alpha_index_i[active_size_i[i]]], minG))
758                                                         {
759                                                                 swap(alpha_index_i[m], alpha_index_i[active_size_i[i]]);
760                                                                 swap(G[m], G[active_size_i[i]]);
761                                                                 if(y_index[i] == active_size_i[i])
762                                                                         y_index[i] = m;
763                                                                 else if(y_index[i] == m)
764                                                                         y_index[i] = active_size_i[i];
765                                                                 break;
766                                                         }
767                                                         active_size_i[i]--;
768                                                 }
769                                         }
770                                 }
771
772                                 if(active_size_i[i] <= 1)
773                                 {
774                                         active_size--;
775                                         swap(index[s], index[active_size]);
776                                         s--;
777                                         continue;
778                                 }
779
780                                 if(maxG-minG <= 1e-12)
781                                         continue;
782                                 else
783                                         stopping = max(maxG - minG, stopping);
784
785                                 for(m=0;m<active_size_i[i];m++)
786                                         B[m] = G[m] - Ai*alpha_i[alpha_index_i[m]] ;
787
788                                 solve_sub_problem(Ai, y_index[i], C[GETI(i)], active_size_i[i], alpha_new);
789                                 int nz_d = 0;
790                                 for(m=0;m<active_size_i[i];m++)
791                                 {
792                                         double d = alpha_new[m] - alpha_i[alpha_index_i[m]];
793                                         alpha_i[alpha_index_i[m]] = alpha_new[m];
794                                         if(fabs(d) >= 1e-12)
795                                         {
796                                                 d_ind[nz_d] = alpha_index_i[m];
797                                                 d_val[nz_d] = d;
798                                                 nz_d++;
799                                         }
800                                 }
801
802                                 xi = prob->x[i];
803                                 while(xi->index != -1)
804                                 {
805                                         double *w_i = &w[(xi->index-1)*nr_class];
806                                         for(m=0;m<nz_d;m++)
807                                                 w_i[d_ind[m]] += d_val[m]*xi->value;
808                                         xi++;
809                                 }
810                         }
811                 }
812
813                 iter++;
814                 if(iter % 10 == 0)
815                 {
816                         info(".");
817                 }
818
819                 if(stopping < eps_shrink)
820                 {
821                         if(stopping < eps && start_from_all == true)
822                                 break;
823                         else
824                         {
825                                 active_size = l;
826                                 for(i=0;i<l;i++)
827                                         active_size_i[i] = nr_class;
828                                 info("*");
829                                 eps_shrink = max(eps_shrink/2, eps);
830                                 start_from_all = true;
831                         }
832                 }
833                 else
834                         start_from_all = false;
835         }
836
837         info("\noptimization finished, #iter = %d\n",iter);
838         if (iter >= max_iter)
839                 info("\nWARNING: reaching max number of iterations\n");
840
841         // calculate objective value
842         double v = 0;
843         int nSV = 0;
844         for(i=0;i<w_size*nr_class;i++)
845                 v += w[i]*w[i];
846         v = 0.5*v;
847         for(i=0;i<l*nr_class;i++)
848         {
849                 v += alpha[i];
850                 if(fabs(alpha[i]) > 0)
851                         nSV++;
852         }
853         for(i=0;i<l;i++)
854                 v -= alpha[i*nr_class+(int)prob->y[i]];
855         info("Objective value = %lf\n",v);
856         info("nSV = %d\n",nSV);
857
858         delete [] alpha;
859         delete [] alpha_new;
860         delete [] index;
861         delete [] QD;
862         delete [] d_ind;
863         delete [] d_val;
864         delete [] alpha_index;
865         delete [] y_index;
866         delete [] active_size_i;
867 }
868
869 // A coordinate descent algorithm for
870 // L1-loss and L2-loss SVM dual problems
871 //
872 //  min_\alpha  0.5(\alpha^T (Q + D)\alpha) - e^T \alpha,
873 //    s.t.      0 <= \alpha_i <= upper_bound_i,
874 //
875 //  where Qij = yi yj xi^T xj and
876 //  D is a diagonal matrix
877 //
878 // In L1-SVM case:
879 //              upper_bound_i = Cp if y_i = 1
880 //              upper_bound_i = Cn if y_i = -1
881 //              D_ii = 0
882 // In L2-SVM case:
883 //              upper_bound_i = INF
884 //              D_ii = 1/(2*Cp) if y_i = 1
885 //              D_ii = 1/(2*Cn) if y_i = -1
886 //
887 // Given:
888 // x, y, Cp, Cn
889 // eps is the stopping tolerance
890 //
891 // solution will be put in w
892 //
893 // See Algorithm 3 of Hsieh et al., ICML 2008
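//
// Minimal call sketch (illustrative only; prob, eps, Cp and Cn are assumed to be
// prepared by the caller):
//
//   double *w = new double[prob->n];
//   solve_l2r_l1l2_svc(prob, w, eps, Cp, Cn, L2R_L2LOSS_SVC_DUAL);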
894
895 #undef GETI
896 #define GETI(i) (y[i]+1)
897 // To support instance weights, define GETI(i) as (i) instead
898
899 static void solve_l2r_l1l2_svc(
900         const problem *prob, double *w, double eps,
901         double Cp, double Cn, int solver_type)
902 {
903         int l = prob->l;
904         int w_size = prob->n;
905         int i, s, iter = 0;
906         double C, d, G;
907         double *QD = new double[l];
908         int max_iter = 1000;
909         int *index = new int[l];
910         double *alpha = new double[l];
911         schar *y = new schar[l];
912         int active_size = l;
913
914         // PG: projected gradient, for shrinking and stopping
915         double PG;
916         double PGmax_old = INF;
917         double PGmin_old = -INF;
918         double PGmax_new, PGmin_new;
919
920         // default solver_type: L2R_L2LOSS_SVC_DUAL
921         double diag[3] = {0.5/Cn, 0, 0.5/Cp};
922         double upper_bound[3] = {INF, 0, INF};
923         if(solver_type == L2R_L1LOSS_SVC_DUAL)
924         {
925                 diag[0] = 0;
926                 diag[2] = 0;
927                 upper_bound[0] = Cn;
928                 upper_bound[2] = Cp;
929         }
930
931         for(i=0; i<l; i++)
932         {
933                 if(prob->y[i] > 0)
934                 {
935                         y[i] = +1;
936                 }
937                 else
938                 {
939                         y[i] = -1;
940                 }
941         }
942
943         // Initial alpha can be set here. Note that
944         // 0 <= alpha[i] <= upper_bound[GETI(i)]
945         for(i=0; i<l; i++)
946                 alpha[i] = 0;
947
948         for(i=0; i<w_size; i++)
949                 w[i] = 0;
950         for(i=0; i<l; i++)
951         {
952                 QD[i] = diag[GETI(i)];
953
954                 feature_node * const xi = prob->x[i];
955                 QD[i] += sparse_operator::nrm2_sq(xi);
956                 sparse_operator::axpy(y[i]*alpha[i], xi, w);
957
958                 index[i] = i;
959         }
960
961         while (iter < max_iter)
962         {
963                 PGmax_new = -INF;
964                 PGmin_new = INF;
965
966                 for (i=0; i<active_size; i++)
967                 {
968                         int j = i+rand()%(active_size-i);
969                         swap(index[i], index[j]);
970                 }
971
972                 for (s=0; s<active_size; s++)
973                 {
974                         i = index[s];
975                         const schar yi = y[i];
976                         feature_node * const xi = prob->x[i];
977
978                         G = yi*sparse_operator::dot(w, xi)-1;
979
980                         C = upper_bound[GETI(i)];
981                         G += alpha[i]*diag[GETI(i)];
982
983                         PG = 0;
984                         if (alpha[i] == 0)
985                         {
986                                 if (G > PGmax_old)
987                                 {
988                                         active_size--;
989                                         swap(index[s], index[active_size]);
990                                         s--;
991                                         continue;
992                                 }
993                                 else if (G < 0)
994                                         PG = G;
995                         }
996                         else if (alpha[i] == C)
997                         {
998                                 if (G < PGmin_old)
999                                 {
1000                                         active_size--;
1001                                         swap(index[s], index[active_size]);
1002                                         s--;
1003                                         continue;
1004                                 }
1005                                 else if (G > 0)
1006                                         PG = G;
1007                         }
1008                         else
1009                                 PG = G;
1010
1011                         PGmax_new = max(PGmax_new, PG);
1012                         PGmin_new = min(PGmin_new, PG);
1013
1014                         if(fabs(PG) > 1.0e-12)
1015                         {
1016                                 double alpha_old = alpha[i];
1017                                 alpha[i] = min(max(alpha[i] - G/QD[i], 0.0), C);
1018                                 d = (alpha[i] - alpha_old)*yi;
1019                                 sparse_operator::axpy(d, xi, w);
1020                         }
1021                 }
1022
1023                 iter++;
1024                 if(iter % 10 == 0)
1025                         info(".");
1026
1027                 if(PGmax_new - PGmin_new <= eps)
1028                 {
1029                         if(active_size == l)
1030                                 break;
1031                         else
1032                         {
1033                                 active_size = l;
1034                                 info("*");
1035                                 PGmax_old = INF;
1036                                 PGmin_old = -INF;
1037                                 continue;
1038                         }
1039                 }
1040                 PGmax_old = PGmax_new;
1041                 PGmin_old = PGmin_new;
1042                 if (PGmax_old <= 0)
1043                         PGmax_old = INF;
1044                 if (PGmin_old >= 0)
1045                         PGmin_old = -INF;
1046         }
1047
1048         info("\noptimization finished, #iter = %d\n",iter);
1049         if (iter >= max_iter)
1050                 info("\nWARNING: reaching max number of iterations\nUsing -s 2 may be faster (also see FAQ)\n\n");
1051
1052         // calculate objective value
1053
1054         double v = 0;
1055         int nSV = 0;
1056         for(i=0; i<w_size; i++)
1057                 v += w[i]*w[i];
1058         for(i=0; i<l; i++)
1059         {
1060                 v += alpha[i]*(alpha[i]*diag[GETI(i)] - 2);
1061                 if(alpha[i] > 0)
1062                         ++nSV;
1063         }
1064         info("Objective value = %lf\n",v/2);
1065         info("nSV = %d\n",nSV);
1066
1067         delete [] QD;
1068         delete [] alpha;
1069         delete [] y;
1070         delete [] index;
1071 }
1072
1073
1074 // A coordinate descent algorithm for
1075 // L1-loss and L2-loss epsilon-SVR dual problem
1076 //
1077 //  min_\beta  0.5\beta^T (Q + diag(lambda)) \beta + p \sum_{i=1}^l|\beta_i| - \sum_{i=1}^l yi\beta_i,
1078 //    s.t.      -upper_bound_i <= \beta_i <= upper_bound_i,
1079 //
1080 //  where Qij = xi^T xj and
1081 //  diag(lambda) is a diagonal matrix with entries lambda_i
1082 //
1083 // In L1-loss SVR case:
1084 //              upper_bound_i = C
1085 //              lambda_i = 0
1086 // In L2-loss SVR case:
1087 //              upper_bound_i = INF
1088 //              lambda_i = 1/(2*C)
1089 //
1090 // Given:
1091 // x, y, p, C
1092 // eps is the stopping tolerance
1093 //
1094 // solution will be put in w
1095 //
1096 // See Algorithm 4 of Ho and Lin, 2012
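//
// Minimal call sketch (illustrative only; prob and param are assumed to be
// prepared by the caller):
//
//   double *w = new double[prob->n];
//   solve_l2r_l1l2_svr(prob, w, param, L2R_L2LOSS_SVR_DUAL);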
1097
1098 #undef GETI
1099 #define GETI(i) (0)
1100 // To support instance weights, define GETI(i) as (i) instead
1101
1102 static void solve_l2r_l1l2_svr(
1103         const problem *prob, double *w, const parameter *param,
1104         int solver_type)
1105 {
1106         int l = prob->l;
1107         double C = param->C;
1108         double p = param->p;
1109         int w_size = prob->n;
1110         double eps = param->eps;
1111         int i, s, iter = 0;
1112         int max_iter = 1000;
1113         int active_size = l;
1114         int *index = new int[l];
1115
1116         double d, G, H;
1117         double Gmax_old = INF;
1118         double Gmax_new, Gnorm1_new;
1119         double Gnorm1_init = -1.0; // Gnorm1_init is initialized at the first iteration
1120         double *beta = new double[l];
1121         double *QD = new double[l];
1122         double *y = prob->y;
1123
1124         // L2R_L2LOSS_SVR_DUAL
1125         double lambda[1], upper_bound[1];
1126         lambda[0] = 0.5/C;
1127         upper_bound[0] = INF;
1128
1129         if(solver_type == L2R_L1LOSS_SVR_DUAL)
1130         {
1131                 lambda[0] = 0;
1132                 upper_bound[0] = C;
1133         }
1134
1135         // Initial beta can be set here. Note that
1136         // -upper_bound <= beta[i] <= upper_bound
1137         for(i=0; i<l; i++)
1138                 beta[i] = 0;
1139
1140         for(i=0; i<w_size; i++)
1141                 w[i] = 0;
1142         for(i=0; i<l; i++)
1143         {
1144                 feature_node * const xi = prob->x[i];
1145                 QD[i] = sparse_operator::nrm2_sq(xi);
1146                 sparse_operator::axpy(beta[i], xi, w);
1147
1148                 index[i] = i;
1149         }
1150
1151
1152         while(iter < max_iter)
1153         {
1154                 Gmax_new = 0;
1155                 Gnorm1_new = 0;
1156
1157                 for(i=0; i<active_size; i++)
1158                 {
1159                         int j = i+rand()%(active_size-i);
1160                         swap(index[i], index[j]);
1161                 }
1162
1163                 for(s=0; s<active_size; s++)
1164                 {
1165                         i = index[s];
1166                         G = -y[i] + lambda[GETI(i)]*beta[i];
1167                         H = QD[i] + lambda[GETI(i)];
1168
1169                         feature_node * const xi = prob->x[i];
1170                         G += sparse_operator::dot(w, xi);
1171
1172                         double Gp = G+p;
1173                         double Gn = G-p;
1174                         double violation = 0;
1175                         if(beta[i] == 0)
1176                         {
1177                                 if(Gp < 0)
1178                                         violation = -Gp;
1179                                 else if(Gn > 0)
1180                                         violation = Gn;
1181                                 else if(Gp>Gmax_old && Gn<-Gmax_old)
1182                                 {
1183                                         active_size--;
1184                                         swap(index[s], index[active_size]);
1185                                         s--;
1186                                         continue;
1187                                 }
1188                         }
1189                         else if(beta[i] >= upper_bound[GETI(i)])
1190                         {
1191                                 if(Gp > 0)
1192                                         violation = Gp;
1193                                 else if(Gp < -Gmax_old)
1194                                 {
1195                                         active_size--;
1196                                         swap(index[s], index[active_size]);
1197                                         s--;
1198                                         continue;
1199                                 }
1200                         }
1201                         else if(beta[i] <= -upper_bound[GETI(i)])
1202                         {
1203                                 if(Gn < 0)
1204                                         violation = -Gn;
1205                                 else if(Gn > Gmax_old)
1206                                 {
1207                                         active_size--;
1208                                         swap(index[s], index[active_size]);
1209                                         s--;
1210                                         continue;
1211                                 }
1212                         }
1213                         else if(beta[i] > 0)
1214                                 violation = fabs(Gp);
1215                         else
1216                                 violation = fabs(Gn);
1217
1218                         Gmax_new = max(Gmax_new, violation);
1219                         Gnorm1_new += violation;
1220
1221                         // obtain Newton direction d
1222                         if(Gp < H*beta[i])
1223                                 d = -Gp/H;
1224                         else if(Gn > H*beta[i])
1225                                 d = -Gn/H;
1226                         else
1227                                 d = -beta[i];
1228
1229                         if(fabs(d) < 1.0e-12)
1230                                 continue;
1231
1232                         double beta_old = beta[i];
1233                         beta[i] = min(max(beta[i]+d, -upper_bound[GETI(i)]), upper_bound[GETI(i)]);
1234                         d = beta[i]-beta_old;
1235
1236                         if(d != 0)
1237                                 sparse_operator::axpy(d, xi, w);
1238                 }
1239
1240                 if(iter == 0)
1241                         Gnorm1_init = Gnorm1_new;
1242                 iter++;
1243                 if(iter % 10 == 0)
1244                         info(".");
1245
1246                 if(Gnorm1_new <= eps*Gnorm1_init)
1247                 {
1248                         if(active_size == l)
1249                                 break;
1250                         else
1251                         {
1252                                 active_size = l;
1253                                 info("*");
1254                                 Gmax_old = INF;
1255                                 continue;
1256                         }
1257                 }
1258
1259                 Gmax_old = Gmax_new;
1260         }
1261
1262         info("\noptimization finished, #iter = %d\n", iter);
1263         if(iter >= max_iter)
1264                 info("\nWARNING: reaching max number of iterations\nUsing -s 11 may be faster\n\n");
1265
1266         // calculate objective value
1267         double v = 0;
1268         int nSV = 0;
1269         for(i=0; i<w_size; i++)
1270                 v += w[i]*w[i];
1271         v = 0.5*v;
1272         for(i=0; i<l; i++)
1273         {
1274                 v += p*fabs(beta[i]) - y[i]*beta[i] + 0.5*lambda[GETI(i)]*beta[i]*beta[i];
1275                 if(beta[i] != 0)
1276                         nSV++;
1277         }
1278
1279         info("Objective value = %lf\n", v);
1280         info("nSV = %d\n",nSV);
1281
1282         delete [] beta;
1283         delete [] QD;
1284         delete [] index;
1285 }
1286
1287
1288 // A coordinate descent algorithm for
1289 // the dual of L2-regularized logistic regression problems
1290 //
1291 //  min_\alpha  0.5(\alpha^T Q \alpha) + \sum \alpha_i log (\alpha_i) + (upper_bound_i - \alpha_i) log (upper_bound_i - \alpha_i),
1292 //    s.t.      0 <= \alpha_i <= upper_bound_i,
1293 //
1294 //  where Qij = yi yj xi^T xj and
1295 //  upper_bound_i = Cp if y_i = 1
1296 //  upper_bound_i = Cn if y_i = -1
1297 //
1298 // Given:
1299 // x, y, Cp, Cn
1300 // eps is the stopping tolerance
1301 //
1302 // solution will be put in w
1303 //
1304 // See Algorithm 5 of Yu et al., MLJ 2010
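//
// Minimal call sketch (illustrative only; prob, eps, Cp and Cn are assumed to be
// prepared by the caller):
//
//   double *w = new double[prob->n];
//   solve_l2r_lr_dual(prob, w, eps, Cp, Cn);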
1305
1306 #undef GETI
1307 #define GETI(i) (y[i]+1)
1308 // To support instance weights, define GETI(i) as (i) instead
1309
1310 void solve_l2r_lr_dual(const problem *prob, double *w, double eps, double Cp, double Cn)
1311 {
1312         int l = prob->l;
1313         int w_size = prob->n;
1314         int i, s, iter = 0;
1315         double *xTx = new double[l];
1316         int max_iter = 1000;
1317         int *index = new int[l];
1318         double *alpha = new double[2*l]; // store alpha and C - alpha
1319         schar *y = new schar[l];
1320         int max_inner_iter = 100; // for inner Newton
1321         double innereps = 1e-2;
1322         double innereps_min = min(1e-8, eps);
1323         double upper_bound[3] = {Cn, 0, Cp};
1324
1325         for(i=0; i<l; i++)
1326         {
1327                 if(prob->y[i] > 0)
1328                 {
1329                         y[i] = +1;
1330                 }
1331                 else
1332                 {
1333                         y[i] = -1;
1334                 }
1335         }
1336
1337         // Initial alpha can be set here. Note that
1338         // 0 < alpha[i] < upper_bound[GETI(i)]
1339         // alpha[2*i] + alpha[2*i+1] = upper_bound[GETI(i)]
1340         for(i=0; i<l; i++)
1341         {
1342                 alpha[2*i] = min(0.001*upper_bound[GETI(i)], 1e-8);
1343                 alpha[2*i+1] = upper_bound[GETI(i)] - alpha[2*i];
1344         }
1345
1346         for(i=0; i<w_size; i++)
1347                 w[i] = 0;
1348         for(i=0; i<l; i++)
1349         {
1350                 feature_node * const xi = prob->x[i];
1351                 xTx[i] = sparse_operator::nrm2_sq(xi);
1352                 sparse_operator::axpy(y[i]*alpha[2*i], xi, w);
1353                 index[i] = i;
1354         }
1355
1356         while (iter < max_iter)
1357         {
1358                 for (i=0; i<l; i++)
1359                 {
1360                         int j = i+rand()%(l-i);
1361                         swap(index[i], index[j]);
1362                 }
1363                 int newton_iter = 0;
1364                 double Gmax = 0;
1365                 for (s=0; s<l; s++)
1366                 {
1367                         i = index[s];
1368                         const schar yi = y[i];
1369                         double C = upper_bound[GETI(i)];
1370                         double ywTx = 0, xisq = xTx[i];
1371                         feature_node * const xi = prob->x[i];
1372                         ywTx = yi*sparse_operator::dot(w, xi);
1373                         double a = xisq, b = ywTx;
1374
1375                         // Decide to minimize g_1(z) or g_2(z)
1376                         int ind1 = 2*i, ind2 = 2*i+1, sign = 1;
1377                         if(0.5*a*(alpha[ind2]-alpha[ind1])+b < 0)
1378                         {
1379                                 ind1 = 2*i+1;
1380                                 ind2 = 2*i;
1381                                 sign = -1;
1382                         }
1383
1384                         //  g_t(z) = z*log(z) + (C-z)*log(C-z) + 0.5a(z-alpha_old)^2 + sign*b(z-alpha_old)
1385                         double alpha_old = alpha[ind1];
1386                         double z = alpha_old;
1387                         if(C - z < 0.5 * C)
1388                                 z = 0.1*z;
1389                         double gp = a*(z-alpha_old)+sign*b+log(z/(C-z));
1390                         Gmax = max(Gmax, fabs(gp));
1391
1392                         // Newton method on the sub-problem
1393                         const double eta = 0.1; // xi in the paper
1394                         int inner_iter = 0;
1395                         while (inner_iter <= max_inner_iter)
1396                         {
1397                                 if(fabs(gp) < innereps)
1398                                         break;
1399                                 double gpp = a + C/(C-z)/z;
1400                                 double tmpz = z - gp/gpp;
1401                                 if(tmpz <= 0)
1402                                         z *= eta;
1403                                 else // tmpz in (0, C)
1404                                         z = tmpz;
1405                                 gp = a*(z-alpha_old)+sign*b+log(z/(C-z));
1406                                 newton_iter++;
1407                                 inner_iter++;
1408                         }
1409
1410                         if(inner_iter > 0) // update w
1411                         {
1412                                 alpha[ind1] = z;
1413                                 alpha[ind2] = C-z;
1414                                 sparse_operator::axpy(sign*(z-alpha_old)*yi, xi, w);
1415                         }
1416                 }
1417
1418                 iter++;
1419                 if(iter % 10 == 0)
1420                         info(".");
1421
1422                 if(Gmax < eps)
1423                         break;
1424
1425                 if(newton_iter <= l/10)
1426                         innereps = max(innereps_min, 0.1*innereps);
1427
1428         }
1429
1430         info("\noptimization finished, #iter = %d\n",iter);
1431         if (iter >= max_iter)
1432                 info("\nWARNING: reaching max number of iterations\nUsing -s 0 may be faster (also see FAQ)\n\n");
1433
1434         // calculate objective value
1435
1436         double v = 0;
1437         for(i=0; i<w_size; i++)
1438                 v += w[i] * w[i];
1439         v *= 0.5;
1440         for(i=0; i<l; i++)
1441                 v += alpha[2*i] * log(alpha[2*i]) + alpha[2*i+1] * log(alpha[2*i+1])
1442                         - upper_bound[GETI(i)] * log(upper_bound[GETI(i)]);
1443         info("Objective value = %lf\n", v);
1444
1445         delete [] xTx;
1446         delete [] alpha;
1447         delete [] y;
1448         delete [] index;
1449 }
1450
1451 // A coordinate descent algorithm for
1452 // L1-regularized L2-loss support vector classification
1453 //
1454 //  min_w \sum |wj| + C \sum max(0, 1-yi w^T xi)^2,
1455 //
1456 // Given:
1457 // x, y, Cp, Cn
1458 // eps is the stopping tolerance
1459 //
1460 // solution will be put in w
1461 //
1462 // See Yuan et al. (2010) and appendix of LIBLINEAR paper, Fan et al. (2008)
1463 //
1464 // To not regularize the bias (i.e., regularize_bias = 0), a constant feature = 1
1465 // must have been added to the original data. (see -B and -R option)
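//
// Minimal call sketch (illustrative only). Unlike the dual solvers above, this
// routine iterates over features, so prob_col is assumed to hold the same data
// in column (feature-major) format:
//
//   double *w = new double[prob_col->n];
//   solve_l1r_l2_svc(prob_col, w, eps, Cp, Cn, param->regularize_bias);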
1466
1467 #undef GETI
1468 #define GETI(i) (y[i]+1)
1469 // To support instance weights, define GETI(i) as (i) instead
1470
1471 static void solve_l1r_l2_svc(
1472         problem *prob_col, double *w, double eps,
1473         double Cp, double Cn, int regularize_bias)
1474 {
1475         int l = prob_col->l;
1476         int w_size = prob_col->n;
1477         int j, s, iter = 0;
1478         int max_iter = 1000;
1479         int active_size = w_size;
1480         int max_num_linesearch = 20;
1481
1482         double sigma = 0.01;
1483         double d, G_loss, G, H;
1484         double Gmax_old = INF;
1485         double Gmax_new, Gnorm1_new;
1486         double Gnorm1_init = -1.0; // Gnorm1_init is initialized at the first iteration
1487         double d_old, d_diff;
1488         double loss_old = 0, loss_new;
1489         double appxcond, cond;
1490
1491         int *index = new int[w_size];
1492         schar *y = new schar[l];
1493         double *b = new double[l]; // b = 1-ywTx
1494         double *xj_sq = new double[w_size];
1495         feature_node *x;
1496
1497         double C[3] = {Cn,0,Cp};
1498
1499         // Initial w can be set here.
1500         for(j=0; j<w_size; j++)
1501                 w[j] = 0;
1502
1503         for(j=0; j<l; j++)
1504         {
1505                 b[j] = 1;
1506                 if(prob_col->y[j] > 0)
1507                         y[j] = 1;
1508                 else
1509                         y[j] = -1;
1510         }
1511         for(j=0; j<w_size; j++)
1512         {
1513                 index[j] = j;
1514                 xj_sq[j] = 0;
1515                 x = prob_col->x[j];
1516                 while(x->index != -1)
1517                 {
1518                         int ind = x->index-1;
1519                         x->value *= y[ind]; // x->value stores yi*xij
1520                         double val = x->value;
1521                         b[ind] -= w[j]*val;
1522                         xj_sq[j] += C[GETI(ind)]*val*val;
1523                         x++;
1524                 }
1525         }
1526
1527         while(iter < max_iter)
1528         {
1529                 Gmax_new = 0;
1530                 Gnorm1_new = 0;
1531
1532                 for(j=0; j<active_size; j++)
1533                 {
1534                         int i = j+rand()%(active_size-j);
1535                         swap(index[i], index[j]);
1536                 }
1537
1538                 for(s=0; s<active_size; s++)
1539                 {
1540                         j = index[s];
1541                         G_loss = 0;
1542                         H = 0;
1543
1544                         x = prob_col->x[j];
1545                         while(x->index != -1)
1546                         {
1547                                 int ind = x->index-1;
1548                                 if(b[ind] > 0)
1549                                 {
1550                                         double val = x->value;
1551                                         double tmp = C[GETI(ind)]*val;
1552                                         G_loss -= tmp*b[ind];
1553                                         H += tmp*val;
1554                                 }
1555                                 x++;
1556                         }
1557                         G_loss *= 2;
1558
1559                         G = G_loss;
1560                         H *= 2;
1561                         H = max(H, 1e-12);
1562
1563                         double violation = 0;
1564                         double Gp = 0, Gn = 0;
1565                         if(j == w_size-1 && regularize_bias == 0)
1566                                 violation = fabs(G);
1567                         else
1568                         {
1569                                 Gp = G+1;
1570                                 Gn = G-1;
1571                                 if(w[j] == 0)
1572                                 {
1573                                         if(Gp < 0)
1574                                                 violation = -Gp;
1575                                         else if(Gn > 0)
1576                                                 violation = Gn;
1577                                         else if(Gp>Gmax_old/l && Gn<-Gmax_old/l)
1578                                         {
1579                                                 active_size--;
1580                                                 swap(index[s], index[active_size]);
1581                                                 s--;
1582                                                 continue;
1583                                         }
1584                                 }
1585                                 else if(w[j] > 0)
1586                                         violation = fabs(Gp);
1587                                 else
1588                                         violation = fabs(Gn);
1589                         }
1590                         Gmax_new = max(Gmax_new, violation);
1591                         Gnorm1_new += violation;
1592
1593                         // obtain Newton direction d
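                        // Sketch of the step computed below: d minimizes the one-variable model
                        //     G*d + 0.5*H*d^2 + |w[j]+d| - |w[j]|
                        // (a soft-thresholded Newton step),
                        //     d = -(G+1)/H   if G+1 <  H*w[j]
                        //     d = -(G-1)/H   if G-1 >  H*w[j]
                        //     d = -w[j]      otherwise,
                        // while the unregularized bias coordinate takes the plain Newton step -G/H.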
1594                         if(j == w_size-1 && regularize_bias == 0)
1595                                 d = -G/H;
1596                         else
1597                         {
1598                                 if(Gp < H*w[j])
1599                                         d = -Gp/H;
1600                                 else if(Gn > H*w[j])
1601                                         d = -Gn/H;
1602                                 else
1603                                         d = -w[j];
1604                         }
1605
1606                         if(fabs(d) < 1.0e-12)
1607                                 continue;
1608
1609                         double delta;
1610                         if(j == w_size-1 && regularize_bias == 0)
1611                                 delta = G*d;
1612                         else
1613                                 delta = fabs(w[j]+d)-fabs(w[j]) + G*d;
1614                         d_old = 0;
1615                         int num_linesearch;
1616                         for(num_linesearch=0; num_linesearch < max_num_linesearch; num_linesearch++)
1617                         {
1618                                 d_diff = d_old - d;
1619                                 if(j == w_size-1 && regularize_bias == 0)
1620                                         cond = -sigma*delta;
1621                                 else
1622                                         cond = fabs(w[j]+d)-fabs(w[j]) - sigma*delta;
1623
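                                // Cheap surrogate test first: appxcond is built from xj_sq[j] and
                                // G_loss; if it is already non-positive, the step is accepted right
                                // away, b[] is updated over this feature's nonzeros, and the exact
                                // loss evaluation below is skipped.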
1624                                 appxcond = xj_sq[j]*d*d + G_loss*d + cond;
1625                                 if(appxcond <= 0)
1626                                 {
1627                                         x = prob_col->x[j];
1628                                         sparse_operator::axpy(d_diff, x, b);
1629                                         break;
1630                                 }
1631
1632                                 if(num_linesearch == 0)
1633                                 {
1634                                         loss_old = 0;
1635                                         loss_new = 0;
1636                                         x = prob_col->x[j];
1637                                         while(x->index != -1)
1638                                         {
1639                                                 int ind = x->index-1;
1640                                                 if(b[ind] > 0)
1641                                                         loss_old += C[GETI(ind)]*b[ind]*b[ind];
1642                                                 double b_new = b[ind] + d_diff*x->value;
1643                                                 b[ind] = b_new;
1644                                                 if(b_new > 0)
1645                                                         loss_new += C[GETI(ind)]*b_new*b_new;
1646                                                 x++;
1647                                         }
1648                                 }
1649                                 else
1650                                 {
1651                                         loss_new = 0;
1652                                         x = prob_col->x[j];
1653                                         while(x->index != -1)
1654                                         {
1655                                                 int ind = x->index-1;
1656                                                 double b_new = b[ind] + d_diff*x->value;
1657                                                 b[ind] = b_new;
1658                                                 if(b_new > 0)
1659                                                         loss_new += C[GETI(ind)]*b_new*b_new;
1660                                                 x++;
1661                                         }
1662                                 }
1663
1664                                 cond = cond + loss_new - loss_old;
1665                                 if(cond <= 0)
1666                                         break;
1667                                 else
1668                                 {
1669                                         d_old = d;
1670                                         d *= 0.5;
1671                                         delta *= 0.5;
1672                                 }
1673                         }
1674
1675                         w[j] += d;
1676
1677                         // recompute b[] if line search takes too many steps
1678                         if(num_linesearch >= max_num_linesearch)
1679                         {
1680                                 info("#");
1681                                 for(int i=0; i<l; i++)
1682                                         b[i] = 1;
1683
1684                                 for(int i=0; i<w_size; i++)
1685                                 {
1686                                         if(w[i]==0) continue;
1687                                         x = prob_col->x[i];
1688                                         sparse_operator::axpy(-w[i], x, b);
1689                                 }
1690                         }
1691                 }
1692
1693                 if(iter == 0)
1694                         Gnorm1_init = Gnorm1_new;
1695                 iter++;
1696                 if(iter % 10 == 0)
1697                         info(".");
1698
1699                 if(Gnorm1_new <= eps*Gnorm1_init)
1700                 {
1701                         if(active_size == w_size)
1702                                 break;
1703                         else
1704                         {
1705                                 active_size = w_size;
1706                                 info("*");
1707                                 Gmax_old = INF;
1708                                 continue;
1709                         }
1710                 }
1711
1712                 Gmax_old = Gmax_new;
1713         }
1714
1715         info("\noptimization finished, #iter = %d\n", iter);
1716         if(iter >= max_iter)
1717                 info("\nWARNING: reaching max number of iterations\n");
1718
1719         // calculate objective value
1720
1721         double v = 0;
1722         int nnz = 0;
1723         for(j=0; j<w_size; j++)
1724         {
1725                 x = prob_col->x[j];
1726                 while(x->index != -1)
1727                 {
1728                         x->value *= prob_col->y[x->index-1]; // restore x->value
1729                         x++;
1730                 }
1731                 if(w[j] != 0)
1732                 {
1733                         v += fabs(w[j]);
1734                         nnz++;
1735                 }
1736         }
1737         if (regularize_bias == 0)
1738                 v -= fabs(w[w_size-1]);
1739         for(j=0; j<l; j++)
1740                 if(b[j] > 0)
1741                         v += C[GETI(j)]*b[j]*b[j];
1742
1743         info("Objective value = %lf\n", v);
1744         info("#nonzeros/#features = %d/%d\n", nnz, w_size);
1745
1746         delete [] index;
1747         delete [] y;
1748         delete [] b;
1749         delete [] xj_sq;
1750 }
1751
1752 // A coordinate descent algorithm for
1753 // L1-regularized logistic regression problems
1754 //
1755 //  min_w \sum |wj| + C \sum log(1+exp(-yi w^T xi)),
1756 //
1757 // Given:
1758 // x, y, Cp, Cn
1759 // eps is the stopping tolerance
1760 //
1761 // solution will be put in w
1762 //
1763 // See Yuan et al. (2011) and the appendix of the LIBLINEAR paper, Fan et al. (2008)
1764 //
1765 // To not regularize the bias (i.e., regularize_bias = 0), a constant feature with value 1
1766 // must have been added to the original data (see the -B and -R options).
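//
// Rough outline of the method below (a brief reading of the code): each outer
// Newton iteration
//   1. forms Grad[j] and a diagonal Hessian approximation Hdiag[j] of the
//      logistic loss, shrinking coordinates by the outer-level rule;
//   2. approximately minimizes the resulting L1-regularized quadratic model by
//      cyclic coordinate descent over wpd (the inner QP loop), with its own
//      inner-level shrinking and tolerance inner_eps;
//   3. runs a backtracking line search between w and wpd, halving the step until
//      an Armijo-type sufficient-decrease condition on the true objective holds.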
1767
1768 #undef GETI
1769 #define GETI(i) (y[i]+1)
1770 // To support instance weights, define GETI(i) as (i)
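// Note: y[i] is stored as -1/+1, so GETI(i) = y[i]+1 indexes C[3] = {Cn, 0, Cp}
// defined below; negative instances get Cn, positive ones Cp, and the middle
// entry is unused.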
1771
1772 static void solve_l1r_lr(
1773         const problem *prob_col, double *w, double eps,
1774         double Cp, double Cn, int regularize_bias)
1775 {
1776         int l = prob_col->l;
1777         int w_size = prob_col->n;
1778         int j, s, newton_iter=0, iter=0;
1779         int max_newton_iter = 100;
1780         int max_iter = 1000;
1781         int max_num_linesearch = 20;
1782         int active_size;
1783         int QP_active_size;
1784
1785         double nu = 1e-12;
1786         double inner_eps = 1;
1787         double sigma = 0.01;
1788         double w_norm, w_norm_new;
1789         double z, G, H;
1790         double Gnorm1_init = -1.0; // Gnorm1_init is initialized at the first iteration
1791         double Gmax_old = INF;
1792         double Gmax_new, Gnorm1_new;
1793         double QP_Gmax_old = INF;
1794         double QP_Gmax_new, QP_Gnorm1_new;
1795         double delta, negsum_xTd, cond;
1796
1797         int *index = new int[w_size];
1798         schar *y = new schar[l];
1799         double *Hdiag = new double[w_size];
1800         double *Grad = new double[w_size];
1801         double *wpd = new double[w_size];
1802         double *xjneg_sum = new double[w_size];
1803         double *xTd = new double[l];
1804         double *exp_wTx = new double[l];
1805         double *exp_wTx_new = new double[l];
1806         double *tau = new double[l];
1807         double *D = new double[l];
1808         feature_node *x;
1809
1810         double C[3] = {Cn,0,Cp};
1811
1812         // Initial w can be set here.
1813         for(j=0; j<w_size; j++)
1814                 w[j] = 0;
1815
1816         for(j=0; j<l; j++)
1817         {
1818                 if(prob_col->y[j] > 0)
1819                         y[j] = 1;
1820                 else
1821                         y[j] = -1;
1822
1823                 exp_wTx[j] = 0;
1824         }
1825
1826         w_norm = 0;
1827         for(j=0; j<w_size; j++)
1828         {
1829                 w_norm += fabs(w[j]);
1830                 wpd[j] = w[j];
1831                 index[j] = j;
1832                 xjneg_sum[j] = 0;
1833                 x = prob_col->x[j];
1834                 while(x->index != -1)
1835                 {
1836                         int ind = x->index-1;
1837                         double val = x->value;
1838                         exp_wTx[ind] += w[j]*val;
1839                         if(y[ind] == -1)
1840                                 xjneg_sum[j] += C[GETI(ind)]*val;
1841                         x++;
1842                 }
1843         }
1844         if (regularize_bias == 0)
1845                 w_norm -= fabs(w[w_size-1]);
1846
1847         for(j=0; j<l; j++)
1848         {
1849                 exp_wTx[j] = exp(exp_wTx[j]);
1850                 double tau_tmp = 1/(1+exp_wTx[j]);
1851                 tau[j] = C[GETI(j)]*tau_tmp;
1852                 D[j] = C[GETI(j)]*exp_wTx[j]*tau_tmp*tau_tmp;
1853         }
1854
1855         while(newton_iter < max_newton_iter)
1856         {
1857                 Gmax_new = 0;
1858                 Gnorm1_new = 0;
1859                 active_size = w_size;
1860
1861                 for(s=0; s<active_size; s++)
1862                 {
1863                         j = index[s];
1864                         Hdiag[j] = nu;
1865                         Grad[j] = 0;
1866
1867                         double tmp = 0;
1868                         x = prob_col->x[j];
1869                         while(x->index != -1)
1870                         {
1871                                 int ind = x->index-1;
1872                                 Hdiag[j] += x->value*x->value*D[ind];
1873                                 tmp += x->value*tau[ind];
1874                                 x++;
1875                         }
1876                         Grad[j] = -tmp + xjneg_sum[j];
1877
1878                         double violation = 0;
1879                         if (j == w_size-1 && regularize_bias == 0)
1880                                 violation = fabs(Grad[j]);
1881                         else
1882                         {
1883                                 double Gp = Grad[j]+1;
1884                                 double Gn = Grad[j]-1;
1885                                 if(w[j] == 0)
1886                                 {
1887                                         if(Gp < 0)
1888                                                 violation = -Gp;
1889                                         else if(Gn > 0)
1890                                                 violation = Gn;
1891                                         //outer-level shrinking
1892                                         else if(Gp>Gmax_old/l && Gn<-Gmax_old/l)
1893                                         {
1894                                                 active_size--;
1895                                                 swap(index[s], index[active_size]);
1896                                                 s--;
1897                                                 continue;
1898                                         }
1899                                 }
1900                                 else if(w[j] > 0)
1901                                         violation = fabs(Gp);
1902                                 else
1903                                         violation = fabs(Gn);
1904                         }
1905                         Gmax_new = max(Gmax_new, violation);
1906                         Gnorm1_new += violation;
1907                 }
1908
1909                 if(newton_iter == 0)
1910                         Gnorm1_init = Gnorm1_new;
1911
1912                 if(Gnorm1_new <= eps*Gnorm1_init)
1913                         break;
1914
1915                 iter = 0;
1916                 QP_Gmax_old = INF;
1917                 QP_active_size = active_size;
1918
1919                 for(int i=0; i<l; i++)
1920                         xTd[i] = 0;
1921
1922                 // optimize QP over wpd
1923                 while(iter < max_iter)
1924                 {
1925                         QP_Gmax_new = 0;
1926                         QP_Gnorm1_new = 0;
1927
1928                         for(j=0; j<QP_active_size; j++)
1929                         {
1930                                 int i = j+rand()%(QP_active_size-j);
1931                                 swap(index[i], index[j]);
1932                         }
1933
1934                         for(s=0; s<QP_active_size; s++)
1935                         {
1936                                 j = index[s];
1937                                 H = Hdiag[j];
1938
1939                                 x = prob_col->x[j];
1940                                 G = Grad[j] + (wpd[j]-w[j])*nu;
1941                                 while(x->index != -1)
1942                                 {
1943                                         int ind = x->index-1;
1944                                         G += x->value*D[ind]*xTd[ind];
1945                                         x++;
1946                                 }
1947
1948                                 double violation = 0;
1949                                 if (j == w_size-1 && regularize_bias == 0)
1950                                 {
1951                                         // bias term not shrunken
1952                                         violation = fabs(G);
1953                                         z = -G/H;
1954                                 }
1955                                 else
1956                                 {
1957                                         double Gp = G+1;
1958                                         double Gn = G-1;
1959                                         if(wpd[j] == 0)
1960                                         {
1961                                                 if(Gp < 0)
1962                                                         violation = -Gp;
1963                                                 else if(Gn > 0)
1964                                                         violation = Gn;
1965                                                 //inner-level shrinking
1966                                                 else if(Gp>QP_Gmax_old/l && Gn<-QP_Gmax_old/l)
1967                                                 {
1968                                                         QP_active_size--;
1969                                                         swap(index[s], index[QP_active_size]);
1970                                                         s--;
1971                                                         continue;
1972                                                 }
1973                                         }
1974                                         else if(wpd[j] > 0)
1975                                                 violation = fabs(Gp);
1976                                         else
1977                                                 violation = fabs(Gn);
1978
1979                                         // obtain solution of one-variable problem
1980                                         if(Gp < H*wpd[j])
1981                                                 z = -Gp/H;
1982                                         else if(Gn > H*wpd[j])
1983                                                 z = -Gn/H;
1984                                         else
1985                                                 z = -wpd[j];
1986                                 }
1987                                 QP_Gmax_new = max(QP_Gmax_new, violation);
1988                                 QP_Gnorm1_new += violation;
1989
1990                                 if(fabs(z) < 1.0e-12)
1991                                         continue;
1992                                 z = min(max(z,-10.0),10.0);
1993
1994                                 wpd[j] += z;
1995
1996                                 x = prob_col->x[j];
1997                                 sparse_operator::axpy(z, x, xTd);
1998                         }
1999
2000                         iter++;
2001
2002                         if(QP_Gnorm1_new <= inner_eps*Gnorm1_init)
2003                         {
2004                                 //inner stopping
2005                                 if(QP_active_size == active_size)
2006                                         break;
2007                                 //active set reactivation
2008                                 else
2009                                 {
2010                                         QP_active_size = active_size;
2011                                         QP_Gmax_old = INF;
2012                                         continue;
2013                                 }
2014                         }
2015
2016                         QP_Gmax_old = QP_Gmax_new;
2017                 }
2018
2019                 if(iter >= max_iter)
2020                         info("WARNING: reaching max number of inner iterations\n");
2021
2022                 delta = 0;
2023                 w_norm_new = 0;
2024                 for(j=0; j<w_size; j++)
2025                 {
2026                         delta += Grad[j]*(wpd[j]-w[j]);
2027                         if(wpd[j] != 0)
2028                                 w_norm_new += fabs(wpd[j]);
2029                 }
2030                 if (regularize_bias == 0)
2031                         w_norm_new -= fabs(wpd[w_size-1]);
2032                 delta += (w_norm_new-w_norm);
2033
2034                 negsum_xTd = 0;
2035                 for(int i=0; i<l; i++)
2036                         if(y[i] == -1)
2037                                 negsum_xTd += C[GETI(i)]*xTd[i];
2038
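                // Backtracking (Armijo-type) line search on the step from w to wpd:
                // cond accumulates the objective change (L1-norm change plus the loss
                // change, the latter evaluated incrementally through exp_wTx and xTd)
                // minus sigma*delta; the step is accepted when cond <= 0, otherwise the
                // step is halved (wpd moves halfway back toward w, and xTd, delta,
                // negsum_xTd are scaled by 0.5).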
2039                 int num_linesearch;
2040                 for(num_linesearch=0; num_linesearch < max_num_linesearch; num_linesearch++)
2041                 {
2042                         cond = w_norm_new - w_norm + negsum_xTd - sigma*delta;
2043
2044                         for(int i=0; i<l; i++)
2045                         {
2046                                 double exp_xTd = exp(xTd[i]);
2047                                 exp_wTx_new[i] = exp_wTx[i]*exp_xTd;
2048                                 cond += C[GETI(i)]*log((1+exp_wTx_new[i])/(exp_xTd+exp_wTx_new[i]));
2049                         }
2050
2051                         if(cond <= 0)
2052                         {
2053                                 w_norm = w_norm_new;
2054                                 for(j=0; j<w_size; j++)
2055                                         w[j] = wpd[j];
2056                                 for(int i=0; i<l; i++)
2057                                 {
2058                                         exp_wTx[i] = exp_wTx_new[i];
2059                                         double tau_tmp = 1/(1+exp_wTx[i]);
2060                                         tau[i] = C[GETI(i)]*tau_tmp;
2061                                         D[i] = C[GETI(i)]*exp_wTx[i]*tau_tmp*tau_tmp;
2062                                 }
2063                                 break;
2064                         }
2065                         else
2066                         {
2067                                 w_norm_new = 0;
2068                                 for(j=0; j<w_size; j++)
2069                                 {
2070                                         wpd[j] = (w[j]+wpd[j])*0.5;
2071                                         if(wpd[j] != 0)
2072                                                 w_norm_new += fabs(wpd[j]);
2073                                 }
2074                                 if (regularize_bias == 0)
2075                                         w_norm_new -= fabs(wpd[w_size-1]);
2076                                 delta *= 0.5;
2077                                 negsum_xTd *= 0.5;
2078                                 for(int i=0; i<l; i++)
2079                                         xTd[i] *= 0.5;
2080                         }
2081                 }
2082
2083                 // Recompute exp_wTx[] from scratch if the line search took too many steps
2084                 if(num_linesearch >= max_num_linesearch)
2085                 {
2086                         for(int i=0; i<l; i++)
2087                                 exp_wTx[i] = 0;
2088
2089                         for(int i=0; i<w_size; i++)
2090                         {
2091                                 if(w[i]==0) continue;
2092                                 x = prob_col->x[i];
2093                                 sparse_operator::axpy(w[i], x, exp_wTx);
2094                         }
2095
2096                         for(int i=0; i<l; i++)
2097                                 exp_wTx[i] = exp(exp_wTx[i]);
2098                 }
2099
2100                 if(iter == 1)
2101                         inner_eps *= 0.25;
2102
2103                 newton_iter++;
2104                 Gmax_old = Gmax_new;
2105
2106                 info("iter %3d  #CD cycles %d\n", newton_iter, iter);
2107         }
2108
2109         info("=========================\n");
2110         info("optimization finished, #iter = %d\n", newton_iter);
2111         if(newton_iter >= max_newton_iter)
2112                 info("WARNING: reaching max number of iterations\n");
2113
2114         // calculate objective value
2115
2116         double v = 0;
2117         int nnz = 0;
2118         for(j=0; j<w_size; j++)
2119                 if(w[j] != 0)
2120                 {
2121                         v += fabs(w[j]);
2122                         nnz++;
2123                 }
2124         if (regularize_bias == 0)
2125                 v -= fabs(w[w_size-1]);
2126         for(j=0; j<l; j++)
2127                 if(y[j] == 1)
2128                         v += C[GETI(j)]*log(1+1/exp_wTx[j]);
2129                 else
2130                         v += C[GETI(j)]*log(1+exp_wTx[j]);
2131
2132         info("Objective value = %lf\n", v);
2133         info("#nonzeros/#features = %d/%d\n", nnz, w_size);
2134
2135         delete [] index;
2136         delete [] y;
2137         delete [] Hdiag;
2138         delete [] Grad;
2139         delete [] wpd;
2140         delete [] xjneg_sum;
2141         delete [] xTd;
2142         delete [] exp_wTx;
2143         delete [] exp_wTx_new;
2144         delete [] tau;
2145         delete [] D;
2146 }
2147
2148 struct heap {
2149         enum HEAP_TYPE { MIN, MAX };
2150         int _size;
2151         HEAP_TYPE _type;
2152         feature_node* a;
2153
2154         heap(int max_size, HEAP_TYPE type)
2155         {
2156                 _size = 0;
2157                 a = new feature_node[max_size];
2158                 _type = type;
2159         }
2160         ~heap()
2161         {
2162                 delete [] a;
2163         }
2164         bool cmp(const feature_node& left, const feature_node& right)
2165         {
2166                 if(_type == MIN)
2167                         return left.value > right.value;
2168                 else
2169                         return left.value < right.value;
2170         }
2171         int size()
2172         {
2173                 return _size;
2174         }
2175         void push(feature_node node)
2176         {
2177                 a[_size] = node;
2178                 _size++;
2179                 int i = _size-1;
2180                 while(i)
2181                 {
2182                         int p = (i-1)/2;
2183                         if(cmp(a[p], a[i]))
2184                         {
2185                                 swap(a[i], a[p]);
2186                                 i = p;
2187                         }
2188                         else
2189                                 break;
2190                 }
2191         }
2192         void pop()
2193         {
2194                 _size--;
2195                 a[0] = a[_size];
2196                 int i = 0;
2197                 while(i*2+1 < _size)
2198                 {
2199                         int l = i*2+1;
2200                         int r = i*2+2;
2201                         if(r < _size && cmp(a[l], a[r]))
2202                                 l = r;
2203                         if(cmp(a[i], a[l]))
2204                         {
2205                                 swap(a[i], a[l]);
2206                                 i = l;
2207                         }
2208                         else
2209                                 break;
2210                 }
2211         }
2212         feature_node top()
2213         {
2214                 return a[0];
2215         }
2216 };
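// Illustrative usage (a minimal sketch mirroring the solver below): a MIN heap of
// capacity k retains the k largest values seen so far,
//
//     heap min_heap(k, heap::MIN);
//     // for each candidate 'node':
//     if(min_heap.size() < k)
//             min_heap.push(node);
//     else if(min_heap.top().value < node.value)
//     {
//             min_heap.pop();
//             min_heap.push(node);
//     }
//
// and a MAX heap of capacity k symmetrically retains the k smallest values.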
2217
2218 // A two-level coordinate descent algorithm for
2219 // a scaled one-class SVM dual problem
2220 //
2221 //  min_\alpha  0.5(\alpha^T Q \alpha),
2222 //    s.t.      0 <= \alpha_i <= 1 and
2223 //              e^T \alpha = \nu l
2224 //
2225 //  where Qij = xi^T xj
2226 //
2227 // Given:
2228 // x, nu
2229 // eps is the stopping tolerance
2230 //
2231 // solution will be put in w and rho
2232 //
2233 // See Algorithm 7 in the supplementary materials of Chou et al., SDM 2020.
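//
// Sketch of the two-level scheme below: the outer level computes negGmax/negGmin
// over the active set and shrinks bounded alphas that cannot form a violating
// pair; two heaps then select up to max(active_size/10, 1) most-violating pairs
// (i, j); each pair gets a two-variable update delta = (Gi - Gj)/quad_coef,
// clipped so that both alphas stay in [0, 1] while alpha_i + alpha_j is preserved.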
2234
2235 static void solve_oneclass_svm(const problem *prob, double *w, double *rho, double eps, double nu)
2236 {
2237         int l = prob->l;
2238         int w_size = prob->n;
2239         int i, j, s, iter = 0;
2240         double Gi, Gj;
2241         double Qij, quad_coef, delta, sum;
2242         double old_alpha_i;
2243         double *QD = new double[l];
2244         double *G = new double[l];
2245         int *index = new int[l];
2246         double *alpha = new double[l];
2247         int max_inner_iter;
2248         int max_iter = 1000;
2249         int active_size = l;
2250
2251         double negGmax;                 // max { -grad(f)_i | alpha_i < 1 }
2252         double negGmin;                 // min { -grad(f)_i | alpha_i > 0 }
2253
2254         int *most_violating_i = new int[l];
2255         int *most_violating_j = new int[l];
2256
2257         int n = (int)(nu*l);            // # of alpha's at upper bound
2258         for(i=0; i<n; i++)
2259                 alpha[i] = 1;
2260         if (n<l)
2261                 alpha[i] = nu*l-n;      // here i == n, so alpha[n] takes the fractional remainder of nu*l
2262         for(i=n+1; i<l; i++)
2263                 alpha[i] = 0;
2264
2265         for(i=0; i<w_size; i++)
2266                 w[i] = 0;
2267         for(i=0; i<l; i++)
2268         {
2269                 feature_node * const xi = prob->x[i];
2270                 QD[i] = sparse_operator::nrm2_sq(xi);
2271                 sparse_operator::axpy(alpha[i], xi, w);
2272
2273                 index[i] = i;
2274         }
2275
2276         while (iter < max_iter)
2277         {
2278                 negGmax = -INF;
2279                 negGmin = INF;
2280
2281                 for (s=0; s<active_size; s++)
2282                 {
2283                         i = index[s];
2284                         feature_node * const xi = prob->x[i];
2285                         G[i] = sparse_operator::dot(w, xi);
2286                         if (alpha[i] < 1)
2287                                 negGmax = max(negGmax, -G[i]);
2288                         if (alpha[i] > 0)
2289                                 negGmin = min(negGmin, -G[i]);
2290                 }
2291
2292                 if (negGmax - negGmin < eps)
2293                 {
2294                         if (active_size == l)
2295                                 break;
2296                         else
2297                         {
2298                                 active_size = l;
2299                                 info("*");
2300                                 continue;
2301                         }
2302                 }
2303
2304                 for(s=0; s<active_size; s++)
2305                 {
2306                         i = index[s];
2307                         if ((alpha[i] == 1 && -G[i] > negGmax) ||
2308                             (alpha[i] == 0 && -G[i] < negGmin))
2309                         {
2310                                 active_size--;
2311                                 swap(index[s], index[active_size]);
2312                                 s--;
2313                         }
2314                 }
2315
2316                 max_inner_iter = max(active_size/10, 1);
2317                 struct heap min_heap = heap(max_inner_iter, heap::MIN);
2318                 struct heap max_heap = heap(max_inner_iter, heap::MAX);
2319                 struct feature_node node;
2320                 for(s=0; s<active_size; s++)
2321                 {
2322                         i = index[s];
2323                         node.index = i;
2324                         node.value = -G[i];
2325
2326                         if (alpha[i] < 1)
2327                         {
2328                                 if (min_heap.size() < max_inner_iter)
2329                                         min_heap.push(node);
2330                                 else if (min_heap.top().value < node.value)
2331                                 {
2332                                         min_heap.pop();
2333                                         min_heap.push(node);
2334                                 }
2335                         }
2336
2337                         if (alpha[i] > 0)
2338                         {
2339                                 if (max_heap.size() < max_inner_iter)
2340                                         max_heap.push(node);
2341                                 else if (max_heap.top().value > node.value)
2342                                 {
2343                                         max_heap.pop();
2344                                         max_heap.push(node);
2345                                 }
2346                         }
2347                 }
2348                 max_inner_iter = min(min_heap.size(), max_heap.size());
2349                 while (max_heap.size() > max_inner_iter)
2350                         max_heap.pop();
2351                 while (min_heap.size() > max_inner_iter)
2352                         min_heap.pop();
2353
2354                 for (s=max_inner_iter-1; s>=0; s--)
2355                 {
2356                         most_violating_i[s] = min_heap.top().index;
2357                         most_violating_j[s] = max_heap.top().index;
2358                         min_heap.pop();
2359                         max_heap.pop();
2360                 }
2361
2362                 for (s=0; s<max_inner_iter; s++)
2363                 {
2364                         i = most_violating_i[s];
2365                         j = most_violating_j[s];
2366
2367                         if ((alpha[i] == 0 && alpha[j] == 0) ||
2368                             (alpha[i] == 1 && alpha[j] == 1))
2369                                 continue;
2370
2371                         feature_node const * xi = prob->x[i];
2372                         feature_node const * xj = prob->x[j];
2373
2374                         Gi = sparse_operator::dot(w, xi);
2375                         Gj = sparse_operator::dot(w, xj);
2376
2377                         int violating_pair = 0;
2378                         if (alpha[i] < 1 && alpha[j] > 0 && -Gj + 1e-12 < -Gi)
2379                                 violating_pair = 1;
2380                         else
2381                                 if (alpha[i] > 0 && alpha[j] < 1 && -Gi + 1e-12 < -Gj)
2382                                         violating_pair = 1;
2383                         if (violating_pair == 0)
2384                                 continue;
2385
2386                         Qij = sparse_operator::sparse_dot(xi, xj);
2387                         quad_coef = QD[i] + QD[j] - 2*Qij;
2388                         if(quad_coef <= 0)
2389                                 quad_coef = 1e-12;
2390                         delta = (Gi - Gj) / quad_coef;
2391                         old_alpha_i = alpha[i];
2392                         sum = alpha[i] + alpha[j];
2393                         alpha[i] = alpha[i] - delta;
2394                         alpha[j] = alpha[j] + delta;
2395                         if (sum > 1)
2396                         {
2397                                 if (alpha[i] > 1)
2398                                 {
2399                                         alpha[i] = 1;
2400                                         alpha[j] = sum - 1;
2401                                 }
2402                         }
2403                         else
2404                         {
2405                                 if (alpha[j] < 0)
2406                                 {
2407                                         alpha[j] = 0;
2408                                         alpha[i] = sum;
2409                                 }
2410                         }
2411                         if (sum > 1)
2412                         {
2413                                 if (alpha[j] > 1)
2414                                 {
2415                                         alpha[j] = 1;
2416                                         alpha[i] = sum - 1;
2417                                 }
2418                         }
2419                         else
2420                         {
2421                                 if (alpha[i] < 0)
2422                                 {
2423                                         alpha[i] = 0;
2424                                         alpha[j] = sum;
2425                                 }
2426                         }
2427                         delta = alpha[i] - old_alpha_i;
2428                         sparse_operator::axpy(delta, xi, w);
2429                         sparse_operator::axpy(-delta, xj, w);
2430                 }
2431                 iter++;
2432                 if (iter % 10 == 0)
2433                         info(".");
2434         }
2435         info("\noptimization finished, #iter = %d\n",iter);
2436         if (iter >= max_iter)
2437                 info("\nWARNING: reaching max number of iterations\n\n");
2438
2439         // calculate objective value
2440         double v = 0;
2441         for(i=0; i<w_size; i++)
2442                 v += w[i]*w[i];
2443         int nSV = 0;
2444         for(i=0; i<l; i++)
2445         {
2446                 if (alpha[i] > 0)
2447                         ++nSV;
2448         }
2449         info("Objective value = %lf\n", v/2);
2450         info("nSV = %d\n", nSV);
2451
2452         // calculate rho
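        // rho is the average of G = w^T x_i over the free variables (0 < alpha_i < 1);
        // if none are free, the midpoint of the bounds obtained from the alphas at 0
        // and at 1 is used instead.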
2453         double nr_free = 0;
2454         double ub = INF, lb = -INF, sum_free = 0;
2455         for(i=0; i<l; i++)
2456         {
2457                 double G = sparse_operator::dot(w, prob->x[i]);
2458                 if (alpha[i] == 0)
2459                         lb = max(lb, G);
2460                 else if (alpha[i] == 1)
2461                         ub = min(ub, G);
2462                 else
2463                 {
2464                         ++nr_free;
2465                         sum_free += G;
2466                 }
2467         }
2468
2469         if (nr_free > 0)
2470                 *rho = sum_free/nr_free;
2471         else
2472                 *rho = (ub + lb)/2;
2473
2474         info("rho = %lf\n", *rho);
2475
2476         delete [] QD;
2477         delete [] G;
2478         delete [] index;
2479         delete [] alpha;
2480         delete [] most_violating_i;
2481         delete [] most_violating_j;
2482 }
2483
2484 // transpose matrix X from row format to column format
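// Sketch of the two-pass scheme below: the first pass counts the nonzeros of each
// column into col_ptr; a prefix sum (adding one extra slot per column for the
// index = -1 terminator) turns the counts into start offsets within x_space; the
// second pass copies each entry into its column while advancing col_ptr, which
// afterwards points at each column's terminator slot.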
2485 static void transpose(const problem *prob, feature_node **x_space_ret, problem *prob_col)
2486 {
2487         int i;
2488         int l = prob->l;
2489         int n = prob->n;
2490         size_t nnz = 0;
2491         size_t *col_ptr = new size_t [n+1];
2492         feature_node *x_space;
2493         prob_col->l = l;
2494         prob_col->n = n;
2495         prob_col->y = new double[l];
2496         prob_col->x = new feature_node*[n];
2497
2498         for(i=0; i<l; i++)
2499                 prob_col->y[i] = prob->y[i];
2500
2501         for(i=0; i<n+1; i++)
2502                 col_ptr[i] = 0;
2503         for(i=0; i<l; i++)
2504         {
2505                 feature_node *x = prob->x[i];
2506                 while(x->index != -1)
2507                 {
2508                         nnz++;
2509                         col_ptr[x->index]++;
2510                         x++;
2511                 }
2512         }
2513         for(i=1; i<n+1; i++)
2514                 col_ptr[i] += col_ptr[i-1] + 1;
2515
2516         x_space = new feature_node[nnz+n];
2517         for(i=0; i<n; i++)
2518                 prob_col->x[i] = &x_space[col_ptr[i]];
2519
2520         for(i=0; i<l; i++)
2521         {
2522                 feature_node *x = prob->x[i];
2523                 while(x->index != -1)
2524                 {
2525                         int ind = x->index-1;
2526                         x_space[col_ptr[ind]].index = i+1; // starts from 1
2527                         x_space[col_ptr[ind]].value = x->value;
2528                         col_ptr[ind]++;
2529                         x++;
2530                 }
2531         }
2532         for(i=0; i<n; i++)
2533                 x_space[col_ptr[i]].index = -1;
2534
2535         *x_space_ret = x_space;
2536
2537         delete [] col_ptr;
2538 }
2539
2540 // label: label names, start: beginning index of each class, count: #data of each class, perm: indices to the original data
2541 // perm, of length l, must be allocated before calling this subroutine
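// Example (illustrative): y = {3, 1, 3, 2} yields nr_class = 3, label = {3, 1, 2},
// count = {2, 1, 1}, start = {0, 2, 3}, and perm = {0, 2, 1, 3}, i.e. the indices
// grouped class by class in order of first appearance.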
2542 static void group_classes(const problem *prob, int *nr_class_ret, int **label_ret, int **start_ret, int **count_ret, int *perm)
2543 {
2544         int l = prob->l;
2545         int max_nr_class = 16;
2546         int nr_class = 0;
2547         int *label = Malloc(int,max_nr_class);
2548         int *count = Malloc(int,max_nr_class);
2549         int *data_label = Malloc(int,l);
2550         int i;
2551
2552         for(i=0;i<l;i++)
2553         {
2554                 int this_label = (int)prob->y[i];
2555                 int j;
2556                 for(j=0;j<nr_class;j++)
2557                 {
2558                         if(this_label == label[j])
2559                         {
2560                                 ++count[j];
2561                                 break;
2562                         }
2563                 }
2564                 data_label[i] = j;
2565                 if(j == nr_class)
2566                 {
2567                         if(nr_class == max_nr_class)
2568                         {
2569                                 max_nr_class *= 2;
2570                                 label = (int *)realloc(label,max_nr_class*sizeof(int));
2571                                 count = (int *)realloc(count,max_nr_class*sizeof(int));
2572                         }
2573                         label[nr_class] = this_label;
2574                         count[nr_class] = 1;
2575                         ++nr_class;
2576                 }
2577         }
2578
2579         //
2580         // Labels are ordered by their first occurrence in the training set.
2581         // However, for two-class sets with -1/+1 labels in which -1 appears first,
2582         // we swap the labels so that, internally, the binary SVM treats the +1 instances as the positive class.
2583         //
2584         if (nr_class == 2 && label[0] == -1 && label[1] == 1)
2585         {
2586                 swap(label[0],label[1]);
2587                 swap(count[0],count[1]);
2588                 for(i=0;i<l;i++)
2589                 {
2590                         if(data_label[i] == 0)
2591                                 data_label[i] = 1;
2592                         else
2593                                 data_label[i] = 0;
2594                 }
2595         }
2596
2597         int *start = Malloc(int,nr_class);
2598         start[0] = 0;
2599         for(i=1;i<nr_class;i++)
2600                 start[i] = start[i-1]+count[i-1];
2601         for(i=0;i<l;i++)
2602         {
2603                 perm[start[data_label[i]]] = i;
2604                 ++start[data_label[i]];
2605         }
2606         start[0] = 0;
2607         for(i=1;i<nr_class;i++)
2608                 start[i] = start[i-1]+count[i-1];
2609
2610         *nr_class_ret = nr_class;
2611         *label_ret = label;
2612         *start_ret = start;
2613         *count_ret = count;
2614         free(data_label);
2615 }
2616
2617 static void train_one(const problem *prob, const parameter *param, double *w, double Cp, double Cn)
2618 {
2619         double eps = param->eps;
2620
2621         int pos = 0;
2622         int neg = 0;
2623         for(int i=0;i<prob->l;i++)
2624                 if(prob->y[i] > 0)
2625                         pos++;
2626         neg = prob->l - pos;
2627         double primal_solver_tol = eps*max(min(pos,neg), 1)/prob->l;
2628
2629         function *fun_obj=NULL;
2630         switch(param->solver_type)
2631         {
2632                 case L2R_LR:
2633                 {
2634                         double *C = new double[prob->l];
2635                         for(int i = 0; i < prob->l; i++)
2636                         {
2637                                 if(prob->y[i] > 0)
2638                                         C[i] = Cp;
2639                                 else
2640                                         C[i] = Cn;
2641                         }
2642                         fun_obj=new l2r_lr_fun(prob, param, C);
2643                         NEWTON newton_obj(fun_obj, primal_solver_tol);
2644                         newton_obj.set_print_string(liblinear_print_string);
2645                         newton_obj.newton(w);
2646                         delete fun_obj;
2647                         delete[] C;
2648                         break;
2649                 }
2650                 case L2R_L2LOSS_SVC:
2651                 {
2652                         double *C = new double[prob->l];
2653                         for(int i = 0; i < prob->l; i++)
2654                         {
2655                                 if(prob->y[i] > 0)
2656                                         C[i] = Cp;
2657                                 else
2658                                         C[i] = Cn;
2659                         }
2660                         fun_obj=new l2r_l2_svc_fun(prob, param, C);
2661                         NEWTON newton_obj(fun_obj, primal_solver_tol);
2662                         newton_obj.set_print_string(liblinear_print_string);
2663                         newton_obj.newton(w);
2664                         delete fun_obj;
2665                         delete[] C;
2666                         break;
2667                 }
2668                 case L2R_L2LOSS_SVC_DUAL:
2669                         solve_l2r_l1l2_svc(prob, w, eps, Cp, Cn, L2R_L2LOSS_SVC_DUAL);
2670                         break;
2671                 case L2R_L1LOSS_SVC_DUAL:
2672                         solve_l2r_l1l2_svc(prob, w, eps, Cp, Cn, L2R_L1LOSS_SVC_DUAL);
2673                         break;
2674                 case L1R_L2LOSS_SVC:
2675                 {
2676                         problem prob_col;
2677                         feature_node *x_space = NULL;
2678                         transpose(prob, &x_space ,&prob_col);
2679                         solve_l1r_l2_svc(&prob_col, w, primal_solver_tol, Cp, Cn, param->regularize_bias);
2680                         delete [] prob_col.y;
2681                         delete [] prob_col.x;
2682                         delete [] x_space;
2683                         break;
2684                 }
2685                 case L1R_LR:
2686                 {
2687                         problem prob_col;
2688                         feature_node *x_space = NULL;
2689                         transpose(prob, &x_space ,&prob_col);
2690                         solve_l1r_lr(&prob_col, w, primal_solver_tol, Cp, Cn, param->regularize_bias);
2691                         delete [] prob_col.y;
2692                         delete [] prob_col.x;
2693                         delete [] x_space;
2694                         break;
2695                 }
2696                 case L2R_LR_DUAL:
2697                         solve_l2r_lr_dual(prob, w, eps, Cp, Cn);
2698                         break;
2699                 case L2R_L2LOSS_SVR:
2700                 {
2701                         double *C = new double[prob->l];
2702                         for(int i = 0; i < prob->l; i++)
2703                                 C[i] = param->C;
2704
2705                         fun_obj=new l2r_l2_svr_fun(prob, param, C);
2706                         NEWTON newton_obj(fun_obj, param->eps);
2707                         newton_obj.set_print_string(liblinear_print_string);
2708                         newton_obj.newton(w);
2709                         delete fun_obj;
2710                         delete[] C;
2711                         break;
2712
2713                 }
2714                 case L2R_L1LOSS_SVR_DUAL:
2715                         solve_l2r_l1l2_svr(prob, w, param, L2R_L1LOSS_SVR_DUAL);
2716                         break;
2717                 case L2R_L2LOSS_SVR_DUAL:
2718                         solve_l2r_l1l2_svr(prob, w, param, L2R_L2LOSS_SVR_DUAL);
2719                         break;
2720                 default:
2721                         fprintf(stderr, "ERROR: unknown solver_type\n");
2722                         break;
2723         }
2724 }
2725
2726 // Calculate the initial C for parameter selection
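// For example, for L2R_LR the value returned below is
//     2^floor(log2(1 / (l * max_i ||x_i||^2))),
// i.e. the heuristic starting C rounded down to a power of two.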
2727 static double calc_start_C(const problem *prob, const parameter *param)
2728 {
2729         int i;
2730         double xTx, max_xTx;
2731         max_xTx = 0;
2732         for(i=0; i<prob->l; i++)
2733         {
2734                 xTx = 0;
2735                 feature_node *xi=prob->x[i];
2736                 while(xi->index != -1)
2737                 {
2738                         double val = xi->value;
2739                         xTx += val*val;
2740                         xi++;
2741                 }
2742                 if(xTx > max_xTx)
2743                         max_xTx = xTx;
2744         }
2745
2746         double min_C = 1.0;
2747         if(param->solver_type == L2R_LR)
2748                 min_C = 1.0 / (prob->l * max_xTx);
2749         else if(param->solver_type == L2R_L2LOSS_SVC)
2750                 min_C = 1.0 / (2 * prob->l * max_xTx);
2751         else if(param->solver_type == L2R_L2LOSS_SVR)
2752         {
2753                 double sum_y, loss, y_abs;
2754                 double delta2 = 0.1;
2755                 sum_y = 0, loss = 0;
2756                 for(i=0; i<prob->l; i++)
2757                 {
2758                         y_abs = fabs(prob->y[i]);
2759                         sum_y += y_abs;
2760                         loss += max(y_abs - param->p, 0.0) * max(y_abs - param->p, 0.0);
2761                 }
2762                 if(loss > 0)
2763                         min_C = delta2 * delta2 * loss / (8 * sum_y * sum_y * max_xTx);
2764                 else
2765                         min_C = INF;
2766         }
2767
2768         return pow( 2, floor(log(min_C) / log(2.0)) );
2769 }
2770
2771 static double calc_max_p(const problem *prob, const parameter *param)
2772 {
2773         int i;
2774         double max_p = 0.0;
2775         for(i = 0; i < prob->l; i++)
2776                 max_p = max(max_p, fabs(prob->y[i]));
2777
2778         return max_p;
2779 }
2780
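// Cross-validation search over C = start_C, 2*start_C, ... up to max_C: each fold's
// model is warm-started from its solution at the previous C (prev_w), and the search
// stops early once the fold solutions stop changing (num_unchanged_w reaches 5) or
// max_C is exceeded.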
2781 static void find_parameter_C(const problem *prob, parameter *param_tmp, double start_C, double max_C, double *best_C, double *best_score, const int *fold_start, const int *perm, const problem *subprob, int nr_fold)
2782 {
2783         // variables for CV
2784         int i;
2785         double *target = Malloc(double, prob->l);
2786
2787         // variables for warm start
2788         double ratio = 2;
2789         double **prev_w = Malloc(double*, nr_fold);
2790         for(i = 0; i < nr_fold; i++)
2791                 prev_w[i] = NULL;
2792         int num_unchanged_w = 0;
2793         void (*default_print_string) (const char *) = liblinear_print_string;
2794
2795         if(param_tmp->solver_type == L2R_LR || param_tmp->solver_type == L2R_L2LOSS_SVC)
2796                 *best_score = 0.0;
2797         else if(param_tmp->solver_type == L2R_L2LOSS_SVR)
2798                 *best_score = INF;
2799         *best_C = start_C;
2800
2801         param_tmp->C = start_C;
2802         while(param_tmp->C <= max_C)
2803         {
2804                 // Output is disabled while running CV at a particular C
2805                 set_print_string_function(&print_null);
2806
2807                 for(i=0; i<nr_fold; i++)
2808                 {
2809                         int j;
2810                         int begin = fold_start[i];
2811                         int end = fold_start[i+1];
2812
2813                         param_tmp->init_sol = prev_w[i];
2814                         struct model *submodel = train(&subprob[i],param_tmp);
2815
2816                         int total_w_size;
2817                         if(submodel->nr_class == 2)
2818                                 total_w_size = subprob[i].n;
2819                         else
2820                                 total_w_size = subprob[i].n * submodel->nr_class;
2821
2822                         if(prev_w[i] == NULL)
2823                         {
2824                                 prev_w[i] = Malloc(double, total_w_size);
2825                                 for(j=0; j<total_w_size; j++)
2826                                         prev_w[i][j] = submodel->w[j];
2827                         }
2828                         else if(num_unchanged_w >= 0)
2829                         {
2830                                 double norm_w_diff = 0;
2831                                 for(j=0; j<total_w_size; j++)
2832                                 {
2833                                         norm_w_diff += (submodel->w[j] - prev_w[i][j])*(submodel->w[j] - prev_w[i][j]);
2834                                         prev_w[i][j] = submodel->w[j];
2835                                 }
2836                                 norm_w_diff = sqrt(norm_w_diff);
2837
2838                                 if(norm_w_diff > 1e-15)
2839                                         num_unchanged_w = -1;
2840                         }
2841                         else
2842                         {
2843                                 for(j=0; j<total_w_size; j++)
2844                                         prev_w[i][j] = submodel->w[j];
2845                         }
2846
2847                         for(j=begin; j<end; j++)
2848                                 target[perm[j]] = predict(submodel,prob->x[perm[j]]);
2849
2850                         free_and_destroy_model(&submodel);
2851                 }
2852                 set_print_string_function(default_print_string);
2853
2854                 if(param_tmp->solver_type == L2R_LR || param_tmp->solver_type == L2R_L2LOSS_SVC)
2855                 {
2856                         int total_correct = 0;
2857                         for(i=0; i<prob->l; i++)
2858                                 if(target[i] == prob->y[i])
2859                                         ++total_correct;
2860                         double current_rate = (double)total_correct/prob->l;
2861                         if(current_rate > *best_score)
2862                         {
2863                                 *best_C = param_tmp->C;
2864                                 *best_score = current_rate;
2865                         }
2866
2867                         info("log2c=%7.2f\trate=%g\n",log(param_tmp->C)/log(2.0),100.0*current_rate);
2868                 }
2869                 else if(param_tmp->solver_type == L2R_L2LOSS_SVR)
2870                 {
2871                         double total_error = 0.0;
2872                         for(i=0; i<prob->l; i++)
2873                         {
2874                                 double y = prob->y[i];
2875                                 double v = target[i];
2876                                 total_error += (v-y)*(v-y);
2877                         }
2878                         double current_error = total_error/prob->l;
2879                         if(current_error < *best_score)
2880                         {
2881                                 *best_C = param_tmp->C;
2882                                 *best_score = current_error;
2883                         }
2884
2885                         info("log2c=%7.2f\tp=%7.2f\tMean squared error=%g\n",log(param_tmp->C)/log(2.0),param_tmp->p,current_error);
2886                 }
2887
2888                 num_unchanged_w++;
2889                 if(num_unchanged_w == 5)
2890                         break;
2891                 param_tmp->C = param_tmp->C*ratio;
2892         }
2893
2894         if(param_tmp->C > max_C)
2895                 info("WARNING: maximum C reached.\n");
2896         free(target);
2897         for(i=0; i<nr_fold; i++)
2898                 free(prev_w[i]);
2899         free(prev_w);
2900 }
2901
2902
2903 //
2904 // Interface functions
2905 //
2906 model* train(const problem *prob, const parameter *param)
2907 {
2908         int i,j;
2909         int l = prob->l;
2910         int n = prob->n;
2911         int w_size = prob->n;
2912         model *model_ = Malloc(model,1);
2913
2914         if(prob->bias>=0)
2915                 model_->nr_feature=n-1;
2916         else
2917                 model_->nr_feature=n;
2918         model_->param = *param;
2919         model_->bias = prob->bias;
2920
2921         if(check_regression_model(model_))
2922         {
2923                 model_->w = Malloc(double, w_size);
2924
2925                 if(param->init_sol != NULL)
2926                         for(i=0;i<w_size;i++)
2927                                 model_->w[i] = param->init_sol[i];
2928                 else
2929                         for(i=0;i<w_size;i++)
2930                                 model_->w[i] = 0;
2931
2932                 model_->nr_class = 2;
2933                 model_->label = NULL;
2934                 train_one(prob, param, model_->w, 0, 0);
2935         }
2936         else if(check_oneclass_model(model_))
2937         {
2938                 model_->w = Malloc(double, w_size);
2939                 model_->nr_class = 2;
2940                 model_->label = NULL;
2941                 solve_oneclass_svm(prob, model_->w, &(model_->rho), param->eps, param->nu);
2942         }
2943         else
2944         {
2945                 int nr_class;
2946                 int *label = NULL;
2947                 int *start = NULL;
2948                 int *count = NULL;
2949                 int *perm = Malloc(int,l);
2950
2951                 // group training data of the same class
2952                 group_classes(prob,&nr_class,&label,&start,&count,perm);
2953
2954                 model_->nr_class=nr_class;
2955                 model_->label = Malloc(int,nr_class);
2956                 for(i=0;i<nr_class;i++)
2957                         model_->label[i] = label[i];
2958
2959                 // calculate weighted C
2960                 double *weighted_C = Malloc(double, nr_class);
2961                 for(i=0;i<nr_class;i++)
2962                         weighted_C[i] = param->C;
2963                 for(i=0;i<param->nr_weight;i++)
2964                 {
2965                         for(j=0;j<nr_class;j++)
2966                                 if(param->weight_label[i] == label[j])
2967                                         break;
2968                         if(j == nr_class)
2969                                 fprintf(stderr,"WARNING: class label %d specified in weight is not found\n", param->weight_label[i]);
2970                         else
2971                                 weighted_C[j] *= param->weight[i];
2972                 }
2973
2974                 // construct the subproblem
2975                 feature_node **x = Malloc(feature_node *,l);
2976                 for(i=0;i<l;i++)
2977                         x[i] = prob->x[perm[i]];
2978
2979                 int k;
2980                 problem sub_prob;
2981                 sub_prob.l = l;
2982                 sub_prob.n = n;
2983                 sub_prob.x = Malloc(feature_node *,sub_prob.l);
2984                 sub_prob.y = Malloc(double,sub_prob.l);
2985
2986                 for(k=0; k<sub_prob.l; k++)
2987                         sub_prob.x[k] = x[k];
2988
2989                 // multi-class SVM by Crammer and Singer
2990                 if(param->solver_type == MCSVM_CS)
2991                 {
2992                         model_->w=Malloc(double, n*nr_class);
2993                         for(i=0;i<nr_class;i++)
2994                                 for(j=start[i];j<start[i]+count[i];j++)
2995                                         sub_prob.y[j] = i;
2996                         Solver_MCSVM_CS Solver(&sub_prob, nr_class, weighted_C, param->eps);
2997                         Solver.Solve(model_->w);
2998                 }
2999                 else
3000                 {
3001                         if(nr_class == 2)
3002                         {
3003                                 model_->w=Malloc(double, w_size);
3004
3005                                 int e0 = start[0]+count[0];
3006                                 k=0;
3007                                 for(; k<e0; k++)
3008                                         sub_prob.y[k] = +1;
3009                                 for(; k<sub_prob.l; k++)
3010                                         sub_prob.y[k] = -1;
3011
3012                                 if(param->init_sol != NULL)
3013                                         for(i=0;i<w_size;i++)
3014                                                 model_->w[i] = param->init_sol[i];
3015                                 else
3016                                         for(i=0;i<w_size;i++)
3017                                                 model_->w[i] = 0;
3018
3019                                 train_one(&sub_prob, param, model_->w, weighted_C[0], weighted_C[1]);
3020                         }
3021                         else
3022                         {
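                                // one-vs-the-rest: for each class, train a binary classifier that
                                // separates that class (+1) from all remaining classes (-1)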
3023                                 model_->w=Malloc(double, w_size*nr_class);
3024                                 double *w=Malloc(double, w_size);
3025                                 for(i=0;i<nr_class;i++)
3026                                 {
3027                                         int si = start[i];
3028                                         int ei = si+count[i];
3029
3030                                         k=0;
3031                                         for(; k<si; k++)
3032                                                 sub_prob.y[k] = -1;
3033                                         for(; k<ei; k++)
3034                                                 sub_prob.y[k] = +1;
3035                                         for(; k<sub_prob.l; k++)
3036                                                 sub_prob.y[k] = -1;
3037
3038                                         if(param->init_sol != NULL)
3039                                                 for(j=0;j<w_size;j++)
3040                                                         w[j] = param->init_sol[j*nr_class+i];
3041                                         else
3042                                                 for(j=0;j<w_size;j++)
3043                                                         w[j] = 0;
3044
3045                                         train_one(&sub_prob, param, w, weighted_C[i], param->C);
3046
3047                                         for(j=0;j<w_size;j++)
3048                                                 model_->w[j*nr_class+i] = w[j];
3049                                 }
3050                                 free(w);
3051                         }
3052
3053                 }
3054
3055                 free(x);
3056                 free(label);
3057                 free(start);
3058                 free(count);
3059                 free(perm);
3060                 free(sub_prob.x);
3061                 free(sub_prob.y);
3062                 free(weighted_C);
3063         }
3064         return model_;
3065 }
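// Usage sketch (illustrative only, not part of the library): a minimal caller
// that builds a tiny two-class problem in memory and trains it.  The feature
// values below are made up; real programs normally read data from a file.
//
//   struct feature_node x1[] = {{1, 1.0}, {-1, 0.0}};   // index -1 terminates a row
//   struct feature_node x2[] = {{2, 1.0}, {-1, 0.0}};
//   struct feature_node *rows[] = {x1, x2};
//   double labels[] = {+1, -1};
//
//   struct problem prob;
//   prob.l = 2; prob.n = 2; prob.y = labels; prob.x = rows; prob.bias = -1;
//
//   struct parameter param;
//   param.solver_type = L2R_L2LOSS_SVC;
//   param.eps = 0.01; param.C = 1; param.p = 0.1; param.nu = 0.5;
//   param.nr_weight = 0; param.weight_label = NULL; param.weight = NULL;
//   param.init_sol = NULL; param.regularize_bias = 1;
//
//   if(check_parameter(&prob, &param) == NULL)
//   {
//           struct model *m = train(&prob, &param);
//           printf("prediction for x1: %g\n", predict(m, x1));   // expected: 1
//           free_and_destroy_model(&m);
//   }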
3066
3067 void cross_validation(const problem *prob, const parameter *param, int nr_fold, double *target)
3068 {
3069         int i;
3070         int *fold_start;
3071         int l = prob->l;
3072         int *perm = Malloc(int,l);
3073         if (nr_fold > l)
3074         {
3075                 nr_fold = l;
3076                 fprintf(stderr,"WARNING: # folds > # data. Will use # folds = # data instead (i.e., leave-one-out cross validation)\n");
3077         }
3078         fold_start = Malloc(int,nr_fold+1);
3079         for(i=0;i<l;i++) perm[i]=i;
3080         for(i=0;i<l;i++)
3081         {
3082                 int j = i+rand()%(l-i);
3083                 swap(perm[i],perm[j]);
3084         }
3085         for(i=0;i<=nr_fold;i++)
3086                 fold_start[i]=i*l/nr_fold;
3087
3088         for(i=0;i<nr_fold;i++)
3089         {
3090                 int begin = fold_start[i];
3091                 int end = fold_start[i+1];
3092                 int j,k;
3093                 struct problem subprob;
3094
3095                 subprob.bias = prob->bias;
3096                 subprob.n = prob->n;
3097                 subprob.l = l-(end-begin);
3098                 subprob.x = Malloc(struct feature_node*,subprob.l);
3099                 subprob.y = Malloc(double,subprob.l);
3100
3101                 k=0;
3102                 for(j=0;j<begin;j++)
3103                 {
3104                         subprob.x[k] = prob->x[perm[j]];
3105                         subprob.y[k] = prob->y[perm[j]];
3106                         ++k;
3107                 }
3108                 for(j=end;j<l;j++)
3109                 {
3110                         subprob.x[k] = prob->x[perm[j]];
3111                         subprob.y[k] = prob->y[perm[j]];
3112                         ++k;
3113                 }
3114                 struct model *submodel = train(&subprob,param);
3115                 for(j=begin;j<end;j++)
3116                         target[perm[j]] = predict(submodel,prob->x[perm[j]]);
3117                 free_and_destroy_model(&submodel);
3118                 free(subprob.x);
3119                 free(subprob.y);
3120         }
3121         free(fold_start);
3122         free(perm);
3123 }
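// Usage sketch (illustrative only): cross_validation() fills target[i] with the
// prediction obtained for instance i while it sat in the held-out fold, so the
// caller scores the result itself.  `prob` and `param` are assumed to be a
// prepared classification problem and parameter set, as in the sketch after
// train() above.
//
//   double *target = (double *)malloc(prob.l*sizeof(double));
//   cross_validation(&prob, &param, 5, target);
//   int correct = 0;
//   for(int i = 0; i < prob.l; i++)
//           if(target[i] == prob.y[i])
//                   ++correct;
//   printf("5-fold CV accuracy = %g%%\n", 100.0*correct/prob.l);
//   free(target);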
3124
3125
3126 void find_parameters(const problem *prob, const parameter *param, int nr_fold, double start_C, double start_p, double *best_C, double *best_p, double *best_score)
3127 {
3128         // prepare CV folds
3129
3130         int i;
3131         int *fold_start;
3132         int l = prob->l;
3133         int *perm = Malloc(int, l);
3134         struct problem *subprob = Malloc(problem,nr_fold);
3135
3136         if (nr_fold > l)
3137         {
3138                 nr_fold = l;
3139                 fprintf(stderr,"WARNING: # folds > # data. Will use # folds = # data instead (i.e., leave-one-out cross validation)\n");
3140         }
3141         fold_start = Malloc(int,nr_fold+1);
3142         for(i=0;i<l;i++) perm[i]=i;
3143         for(i=0;i<l;i++)
3144         {
3145                 int j = i+rand()%(l-i);
3146                 swap(perm[i],perm[j]);
3147         }
3148         for(i=0;i<=nr_fold;i++)
3149                 fold_start[i]=i*l/nr_fold;
3150
3151         for(i=0;i<nr_fold;i++)
3152         {
3153                 int begin = fold_start[i];
3154                 int end = fold_start[i+1];
3155                 int j,k;
3156
3157                 subprob[i].bias = prob->bias;
3158                 subprob[i].n = prob->n;
3159                 subprob[i].l = l-(end-begin);
3160                 subprob[i].x = Malloc(struct feature_node*,subprob[i].l);
3161                 subprob[i].y = Malloc(double,subprob[i].l);
3162
3163                 k=0;
3164                 for(j=0;j<begin;j++)
3165                 {
3166                         subprob[i].x[k] = prob->x[perm[j]];
3167                         subprob[i].y[k] = prob->y[perm[j]];
3168                         ++k;
3169                 }
3170                 for(j=end;j<l;j++)
3171                 {
3172                         subprob[i].x[k] = prob->x[perm[j]];
3173                         subprob[i].y[k] = prob->y[perm[j]];
3174                         ++k;
3175                 }
3176
3177         }
3178
3179         struct parameter param_tmp = *param;
3180         *best_p = -1;
3181         if(param->solver_type == L2R_LR || param->solver_type == L2R_L2LOSS_SVC)
3182         {
3183                 if(start_C <= 0)
3184                         start_C = calc_start_C(prob, &param_tmp);
3185                 double max_C = 1024;
3186                 start_C = min(start_C, max_C);
3187                 double best_C_tmp, best_score_tmp;
3188
3189                 find_parameter_C(prob, &param_tmp, start_C, max_C, &best_C_tmp, &best_score_tmp, fold_start, perm, subprob, nr_fold);
3190
3191                 *best_C = best_C_tmp;
3192                 *best_score = best_score_tmp;
3193         }
3194         else if(param->solver_type == L2R_L2LOSS_SVR)
3195         {
3196                 double max_p = calc_max_p(prob, &param_tmp);
3197                 int num_p_steps = 20;
3198                 double max_C = 1048576;
3199                 *best_score = INF;
3200
3201                 i = num_p_steps-1;
3202                 if(start_p > 0)
3203                         i = min((int)(start_p/(max_p/num_p_steps)), i);
3204                 for(; i >= 0; i--)
3205                 {
3206                         param_tmp.p = i*max_p/num_p_steps;
3207                         double start_C_tmp;
3208                         if(start_C <= 0)
3209                                 start_C_tmp = calc_start_C(prob, &param_tmp);
3210                         else
3211                                 start_C_tmp = start_C;
3212                         start_C_tmp = min(start_C_tmp, max_C);
3213                         double best_C_tmp, best_score_tmp;
3214
3215                         find_parameter_C(prob, &param_tmp, start_C_tmp, max_C, &best_C_tmp, &best_score_tmp, fold_start, perm, subprob, nr_fold);
3216
3217                         if(best_score_tmp < *best_score)
3218                         {
3219                                 *best_p = param_tmp.p;
3220                                 *best_C = best_C_tmp;
3221                                 *best_score = best_score_tmp;
3222                         }
3223                 }
3224         }
3225
3226         free(fold_start);
3227         free(perm);
3228         for(i=0; i<nr_fold; i++)
3229         {
3230                 free(subprob[i].x);
3231                 free(subprob[i].y);
3232         }
3233         free(subprob);
3234 }
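// Usage sketch (illustrative only): parameter search is implemented only for
// L2R_LR and L2R_L2LOSS_SVC (search over C) and L2R_L2LOSS_SVR (search over C
// and p).  Passing start_C <= 0 (and start_p <= 0 for SVR) lets the function
// pick its own starting points; for the classification solvers best_p is
// reported as -1 and best_score is the cross-validation accuracy rate.
//
//   double best_C, best_p, best_score;
//   find_parameters(&prob, &param, 5, -1, -1, &best_C, &best_p, &best_score);
//   printf("best C = %g (CV accuracy %g%%)\n", best_C, 100.0*best_score);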
3235
3236 double predict_values(const struct model *model_, const struct feature_node *x, double *dec_values)
3237 {
3238         int idx;
3239         int n;
3240         if(model_->bias>=0)
3241                 n=model_->nr_feature+1;
3242         else
3243                 n=model_->nr_feature;
3244         double *w=model_->w;
3245         int nr_class=model_->nr_class;
3246         int i;
3247         int nr_w;
3248         if(nr_class==2 && model_->param.solver_type != MCSVM_CS)
3249                 nr_w = 1;
3250         else
3251                 nr_w = nr_class;
3252
3253         const feature_node *lx=x;
3254         for(i=0;i<nr_w;i++)
3255                 dec_values[i] = 0;
3256         for(; (idx=lx->index)!=-1; lx++)
3257         {
3258                 // the dimension of testing data may exceed that of training
3259                 if(idx<=n)
3260                         for(i=0;i<nr_w;i++)
3261                                 dec_values[i] += w[(idx-1)*nr_w+i]*lx->value;
3262         }
3263         if(check_oneclass_model(model_))
3264                 dec_values[0] -= model_->rho;
3265
3266         if(nr_class==2)
3267         {
3268                 if(check_regression_model(model_))
3269                         return dec_values[0];
3270                 else if(check_oneclass_model(model_))
3271                         return (dec_values[0]>0)?1:-1;
3272                 else
3273                         return (dec_values[0]>0)?model_->label[0]:model_->label[1];
3274         }
3275         else
3276         {
3277                 int dec_max_idx = 0;
3278                 for(i=1;i<nr_class;i++)
3279                 {
3280                         if(dec_values[i] > dec_values[dec_max_idx])
3281                                 dec_max_idx = i;
3282                 }
3283                 return model_->label[dec_max_idx];
3284         }
3285 }
3286
3287 double predict(const model *model_, const feature_node *x)
3288 {
3289         double *dec_values = Malloc(double, model_->nr_class);
3290         double label=predict_values(model_, x, dec_values);
3291         free(dec_values);
3292         return label;
3293 }
3294
3295 double predict_probability(const struct model *model_, const struct feature_node *x, double* prob_estimates)
3296 {
3297         if(check_probability_model(model_))
3298         {
3299                 int i;
3300                 int nr_class=model_->nr_class;
3301                 int nr_w;
3302                 if(nr_class==2)
3303                         nr_w = 1;
3304                 else
3305                         nr_w = nr_class;
3306
3307                 double label=predict_values(model_, x, prob_estimates);
3308                 for(i=0;i<nr_w;i++)
3309                         prob_estimates[i]=1/(1+exp(-prob_estimates[i]));
3310
3311                 if(nr_class==2) // for binary classification
3312                         prob_estimates[1]=1.-prob_estimates[0];
3313                 else
3314                 {
3315                         double sum=0;
3316                         for(i=0; i<nr_class; i++)
3317                                 sum+=prob_estimates[i];
3318
3319                         for(i=0; i<nr_class; i++)
3320                                 prob_estimates[i]=prob_estimates[i]/sum;
3321                 }
3322
3323                 return label;
3324         }
3325         else
3326                 return 0;
3327 }
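// Usage sketch (illustrative only): probability estimates are returned in the
// order of model_->label, so get_labels() is needed to attach each estimate to
// its class; prob_estimates must hold nr_class entries.  Only the logistic
// regression solvers pass check_probability_model().  `m` and `x1` are the
// model and instance from the sketch after train() above.
//
//   if(check_probability_model(m))
//   {
//           int nr_class = get_nr_class(m);
//           int *labels = (int *)malloc(nr_class*sizeof(int));
//           double *probs = (double *)malloc(nr_class*sizeof(double));
//           get_labels(m, labels);
//           predict_probability(m, x1, probs);
//           for(int i = 0; i < nr_class; i++)
//                   printf("P(y=%d | x1) = %g\n", labels[i], probs[i]);
//           free(labels);
//           free(probs);
//   }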
3328
3329 static const char *solver_type_table[]=
3330 {
3331         "L2R_LR", "L2R_L2LOSS_SVC_DUAL", "L2R_L2LOSS_SVC", "L2R_L1LOSS_SVC_DUAL", "MCSVM_CS",
3332         "L1R_L2LOSS_SVC", "L1R_LR", "L2R_LR_DUAL",
3333         "", "", "",
3334         "L2R_L2LOSS_SVR", "L2R_L2LOSS_SVR_DUAL", "L2R_L1LOSS_SVR_DUAL",
3335         "", "", "", "", "", "", "",
3336         "ONECLASS_SVM", NULL
3337 };
3338
3339 int save_model(const char *model_file_name, const struct model *model_)
3340 {
3341         int i;
3342         int nr_feature=model_->nr_feature;
3343         int n;
3344         const parameter& param = model_->param;
3345
3346         if(model_->bias>=0)
3347                 n=nr_feature+1;
3348         else
3349                 n=nr_feature;
3350         int w_size = n;
3351         FILE *fp = fopen(model_file_name,"w");
3352         if(fp==NULL) return -1;
3353
3354         char *old_locale = setlocale(LC_ALL, NULL);
3355         if (old_locale)
3356         {
3357                 old_locale = strdup(old_locale);
3358         }
3359         setlocale(LC_ALL, "C");
3360
3361         int nr_w;
3362         if(model_->nr_class==2 && model_->param.solver_type != MCSVM_CS)
3363                 nr_w=1;
3364         else
3365                 nr_w=model_->nr_class;
3366
3367         fprintf(fp, "solver_type %s\n", solver_type_table[param.solver_type]);
3368         fprintf(fp, "nr_class %d\n", model_->nr_class);
3369
3370         if(model_->label)
3371         {
3372                 fprintf(fp, "label");
3373                 for(i=0; i<model_->nr_class; i++)
3374                         fprintf(fp, " %d", model_->label[i]);
3375                 fprintf(fp, "\n");
3376         }
3377
3378         fprintf(fp, "nr_feature %d\n", nr_feature);
3379
3380         fprintf(fp, "bias %.17g\n", model_->bias);
3381
3382         if(check_oneclass_model(model_))
3383                 fprintf(fp, "rho %.17g\n", model_->rho);
3384
3385         fprintf(fp, "w\n");
3386         for(i=0; i<w_size; i++)
3387         {
3388                 int j;
3389                 for(j=0; j<nr_w; j++)
3390                         fprintf(fp, "%.17g ", model_->w[i*nr_w+j]);
3391                 fprintf(fp, "\n");
3392         }
3393
3394         setlocale(LC_ALL, old_locale);
3395         free(old_locale);
3396
3397         if (ferror(fp) != 0 || fclose(fp) != 0) return -1;
3398         else return 0;
3399 }
3400
3401 //
3402 // FSCANF checks that fscanf read the expected item and aborts model
3403 // loading on failure.  Its do-while(0) wrapper makes the macro behave as
3404 // a single statement, so
3405 //    if (...) FSCANF(...);
3406 // works as expected.
3407 //
3408 #define FSCANF(_stream, _format, _var)do\
3409 {\
3410         if (fscanf(_stream, _format, _var) != 1)\
3411         {\
3412                 fprintf(stderr, "ERROR: fscanf failed to read the model\n");\
3413                 EXIT_LOAD_MODEL()\
3414         }\
3415 }while(0)
3416 // EXIT_LOAD_MODEL expands to a bare block, so calls to it are NOT followed by a semicolon.
3417 #define EXIT_LOAD_MODEL()\
3418 {\
3419         setlocale(LC_ALL, old_locale);\
3420         free(model_->label);\
3421         free(model_);\
3422         free(old_locale);\
3423         return NULL;\
3424 }
3425 struct model *load_model(const char *model_file_name)
3426 {
3427         FILE *fp = fopen(model_file_name,"r");
3428         if(fp==NULL) return NULL;
3429
3430         int i;
3431         int nr_feature;
3432         int n;
3433         int nr_class;
3434         double bias;
3435         double rho;
3436         model *model_ = Malloc(model,1);
3437         parameter& param = model_->param;
3438         // training-only parameters are not stored in the model file; the pointer members are set to NULL so destroy_param() remains safe to call
3439         param.nr_weight = 0;
3440         param.weight_label = NULL;
3441         param.weight = NULL;
3442         param.init_sol = NULL;
3443
3444         model_->label = NULL;
3445
3446         char *old_locale = setlocale(LC_ALL, NULL);
3447         if (old_locale)
3448         {
3449                 old_locale = strdup(old_locale);
3450         }
3451         setlocale(LC_ALL, "C");
3452
3453         char cmd[81];
3454         while(1)
3455         {
3456                 FSCANF(fp,"%80s",cmd);
3457                 if(strcmp(cmd,"solver_type")==0)
3458                 {
3459                         FSCANF(fp,"%80s",cmd);
3460                         int i;
3461                         for(i=0;solver_type_table[i];i++)
3462                         {
3463                                 if(strcmp(solver_type_table[i],cmd)==0)
3464                                 {
3465                                         param.solver_type=i;
3466                                         break;
3467                                 }
3468                         }
3469                         if(solver_type_table[i] == NULL)
3470                         {
3471                                 fprintf(stderr,"unknown solver type.\n");
3472                                 EXIT_LOAD_MODEL()
3473                         }
3474                 }
3475                 else if(strcmp(cmd,"nr_class")==0)
3476                 {
3477                         FSCANF(fp,"%d",&nr_class);
3478                         model_->nr_class=nr_class;
3479                 }
3480                 else if(strcmp(cmd,"nr_feature")==0)
3481                 {
3482                         FSCANF(fp,"%d",&nr_feature);
3483                         model_->nr_feature=nr_feature;
3484                 }
3485                 else if(strcmp(cmd,"bias")==0)
3486                 {
3487                         FSCANF(fp,"%lf",&bias);
3488                         model_->bias=bias;
3489                 }
3490                 else if(strcmp(cmd,"rho")==0)
3491                 {
3492                         FSCANF(fp,"%lf",&rho);
3493                         model_->rho=rho;
3494                 }
3495                 else if(strcmp(cmd,"w")==0)
3496                 {
3497                         break;
3498                 }
3499                 else if(strcmp(cmd,"label")==0)
3500                 {
3501                         int nr_class = model_->nr_class;
3502                         model_->label = Malloc(int,nr_class);
3503                         for(int i=0;i<nr_class;i++)
3504                                 FSCANF(fp,"%d",&model_->label[i]);
3505                 }
3506                 else
3507                 {
3508                         fprintf(stderr,"unknown text in model file: [%s]\n",cmd);
3509                         EXIT_LOAD_MODEL()
3510                 }
3511         }
3512
3513         nr_feature=model_->nr_feature;
3514         if(model_->bias>=0)
3515                 n=nr_feature+1;
3516         else
3517                 n=nr_feature;
3518         int w_size = n;
3519         int nr_w;
3520         if(nr_class==2 && param.solver_type != MCSVM_CS)
3521                 nr_w = 1;
3522         else
3523                 nr_w = nr_class;
3524
3525         model_->w=Malloc(double, w_size*nr_w);
3526         for(i=0; i<w_size; i++)
3527         {
3528                 int j;
3529                 for(j=0; j<nr_w; j++)
3530                         FSCANF(fp, "%lf ", &model_->w[i*nr_w+j]);
3531         }
3532
3533         setlocale(LC_ALL, old_locale);
3534         free(old_locale);
3535
3536         if (ferror(fp) != 0 || fclose(fp) != 0) return NULL;
3537
3538         return model_;
3539 }
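// Usage sketch (illustrative only): a save/load round trip.  The file name is
// made up; save_model() returns a nonzero value and load_model() returns NULL
// on failure, and both should be checked.
//
//   if(save_model("example.model", m) != 0)
//           fprintf(stderr, "can't save model\n");
//   struct model *loaded = load_model("example.model");
//   if(loaded == NULL)
//           fprintf(stderr, "can't load model\n");
//   else
//   {
//           printf("prediction from reloaded model: %g\n", predict(loaded, x1));
//           free_and_destroy_model(&loaded);
//   }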
3540
3541 int get_nr_feature(const model *model_)
3542 {
3543         return model_->nr_feature;
3544 }
3545
3546 int get_nr_class(const model *model_)
3547 {
3548         return model_->nr_class;
3549 }
3550
3551 void get_labels(const model *model_, int* label)
3552 {
3553         if (model_->label != NULL)
3554                 for(int i=0;i<model_->nr_class;i++)
3555                         label[i] = model_->label[i];
3556 }
3557
3558 // use inline here for better performance (around 20% faster than the non-inline version)
3559 static inline double get_w_value(const struct model *model_, int idx, int label_idx)
3560 {
3561         int nr_class = model_->nr_class;
3562         int solver_type = model_->param.solver_type;
3563         const double *w = model_->w;
3564
3565         if(idx < 0 || idx > model_->nr_feature)
3566                 return 0;
3567         if(check_regression_model(model_) || check_oneclass_model(model_))
3568                 return w[idx];
3569         else
3570         {
3571                 if(label_idx < 0 || label_idx >= nr_class)
3572                         return 0;
3573                 if(nr_class == 2 && solver_type != MCSVM_CS)
3574                 {
3575                         if(label_idx == 0)
3576                                 return w[idx];
3577                         else
3578                                 return -w[idx];
3579                 }
3580                 else
3581                         return w[idx*nr_class+label_idx];
3582         }
3583 }
3584
3585 // feat_idx: ranges from 1 to nr_feature
3586 // label_idx: ranges from 0 to nr_class-1 for classification models;
3587 //            for regression and one-class SVM models, label_idx is
3588 //            ignored.
3589 double get_decfun_coef(const struct model *model_, int feat_idx, int label_idx)
3590 {
3591         if(feat_idx > model_->nr_feature)
3592                 return 0;
3593         return get_w_value(model_, feat_idx-1, label_idx);
3594 }
3595
3596 double get_decfun_bias(const struct model *model_, int label_idx)
3597 {
3598         if(check_oneclass_model(model_))
3599         {
3600                 fprintf(stderr, "ERROR: get_decfun_bias can not be called for a one-class SVM model\n");
3601                 return 0;
3602         }
3603         int bias_idx = model_->nr_feature;
3604         double bias = model_->bias;
3605         if(bias <= 0)
3606                 return 0;
3607         else
3608                 return bias*get_w_value(model_, bias_idx, label_idx);
3609 }
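// Usage sketch (illustrative only): for a two-class model the decision value
// of an instance can be rebuilt from the accessors above; a positive value
// corresponds to model_->label[0].  `m` and `x1` are from the sketch after
// train(); if the model was trained with a bias term, x1 must carry the extra
// bias feature just as it would for predict().
//
//   double dec = get_decfun_bias(m, 0);
//   for(const struct feature_node *px = x1; px->index != -1; px++)
//           dec += get_decfun_coef(m, px->index, 0)*px->value;
//   printf("decision value for x1: %g\n", dec);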
3610
3611 double get_decfun_rho(const struct model *model_)
3612 {
3613         if(check_oneclass_model(model_))
3614                 return model_->rho;
3615         else
3616         {
3617                 fprintf(stderr, "ERROR: get_decfun_rho can be called only for a one-class SVM model\n");
3618                 return 0;
3619         }
3620 }
3621
3622 void free_model_content(struct model *model_ptr)
3623 {
3624         if(model_ptr->w != NULL)
3625                 free(model_ptr->w);
3626         if(model_ptr->label != NULL)
3627                 free(model_ptr->label);
3628 }
3629
3630 void free_and_destroy_model(struct model **model_ptr_ptr)
3631 {
3632         struct model *model_ptr = *model_ptr_ptr;
3633         if(model_ptr != NULL)
3634         {
3635                 free_model_content(model_ptr);
3636                 free(model_ptr);
3637         }
3638 }
3639
3640 void destroy_param(parameter* param)
3641 {
3642         if(param->weight_label != NULL)
3643                 free(param->weight_label);
3644         if(param->weight != NULL)
3645                 free(param->weight);
3646         if(param->init_sol != NULL)
3647                 free(param->init_sol);
3648 }
3649
3650 const char *check_parameter(const problem *prob, const parameter *param)
3651 {
3652         if(param->eps <= 0)
3653                 return "eps <= 0";
3654
3655         if(param->C <= 0)
3656                 return "C <= 0";
3657
3658         if(param->p < 0)
3659                 return "p < 0";
3660
3661         if(prob->bias >= 0 && param->solver_type == ONECLASS_SVM)
3662                 return "prob->bias >=0, but this is ignored in ONECLASS_SVM";
3663
3664         if(param->regularize_bias == 0)
3665         {
3666                 if(prob->bias != 1.0)
3667                         return "To not regularize bias, must specify -B 1 along with -R";
3668                 if(param->solver_type != L2R_LR
3669                         && param->solver_type != L2R_L2LOSS_SVC
3670                         && param->solver_type != L1R_L2LOSS_SVC
3671                         && param->solver_type != L1R_LR
3672                         && param->solver_type != L2R_L2LOSS_SVR)
3673                         return "-R option supported only for solvers L2R_LR, L2R_L2LOSS_SVC, L1R_L2LOSS_SVC, L1R_LR, and L2R_L2LOSS_SVR";
3674         }
3675
3676         if(param->solver_type != L2R_LR
3677                 && param->solver_type != L2R_L2LOSS_SVC_DUAL
3678                 && param->solver_type != L2R_L2LOSS_SVC
3679                 && param->solver_type != L2R_L1LOSS_SVC_DUAL
3680                 && param->solver_type != MCSVM_CS
3681                 && param->solver_type != L1R_L2LOSS_SVC
3682                 && param->solver_type != L1R_LR
3683                 && param->solver_type != L2R_LR_DUAL
3684                 && param->solver_type != L2R_L2LOSS_SVR
3685                 && param->solver_type != L2R_L2LOSS_SVR_DUAL
3686                 && param->solver_type != L2R_L1LOSS_SVR_DUAL
3687                 && param->solver_type != ONECLASS_SVM)
3688                 return "unknown solver type";
3689
3690         if(param->init_sol != NULL
3691                 && param->solver_type != L2R_LR
3692                 && param->solver_type != L2R_L2LOSS_SVC
3693                 && param->solver_type != L2R_L2LOSS_SVR)
3694                 return "Initial-solution specification supported only for solvers L2R_LR, L2R_L2LOSS_SVC, and L2R_L2LOSS_SVR";
3695
3696         return NULL;
3697 }
3698
3699 int check_probability_model(const struct model *model_)
3700 {
3701         return (model_->param.solver_type==L2R_LR ||
3702                         model_->param.solver_type==L2R_LR_DUAL ||
3703                         model_->param.solver_type==L1R_LR);
3704 }
3705
3706 int check_regression_model(const struct model *model_)
3707 {
3708         return (model_->param.solver_type==L2R_L2LOSS_SVR ||
3709                         model_->param.solver_type==L2R_L1LOSS_SVR_DUAL ||
3710                         model_->param.solver_type==L2R_L2LOSS_SVR_DUAL);
3711 }
3712
3713 int check_oneclass_model(const struct model *model_)
3714 {
3715         return model_->param.solver_type == ONECLASS_SVM;
3716 }
3717
3718 void set_print_string_function(void (*print_func)(const char*))
3719 {
3720         if (print_func == NULL)
3721                 liblinear_print_string = &print_string_stdout;
3722         else
3723                 liblinear_print_string = print_func;
3724 }
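// Usage sketch (illustrative only): redirect or silence the library's training
// output.  Passing NULL restores the default stdout printer.
//
//   static void quiet(const char *s) { (void)s; }   // discard all messages
//
//   set_print_string_function(&quiet);   // silence info() output
//   struct model *m = train(&prob, &param);
//   set_print_string_function(NULL);     // print to stdout again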
3725