granicus.if.org Git - liblinear/blob - python/liblinearutil.py

   1 #!/usr/bin/env python
   2
   3 import os, sys
   4 sys.path = [os.path.dirname(os.path.abspath(__file__))] + sys.path
   5 from liblinear import *
   6 from liblinear import __all__ as liblinear_all
   7 from liblinear import scipy, sparse
   8 from commonutil import *
   9 from commonutil import __all__ as common_all
  10 from ctypes import c_double
  11
  12 if sys.version_info[0] < 3:
  13         range = xrange
  14         from itertools import izip as zip
  15         _cstr = lambda s: s.encode("utf-8") if isinstance(s,unicode) else str(s)
  16 else:
  17         _cstr = lambda s: bytes(s, "utf-8")
  18
  19 __all__ = ['load_model', 'save_model', 'train', 'predict'] + liblinear_all + common_all
  20
  21
  22 def load_model(model_file_name):
  23         """
  24         load_model(model_file_name) -> model
  25
  26         Load a LIBLINEAR model from model_file_name and return.
  27         """
  28         model = liblinear.load_model(_cstr(model_file_name))
  29         if not model:
  30                 print("can't open model file %s" % model_file_name)
  31                 return None
  32         model = toPyModel(model)
  33         return model
  34
  35 def save_model(model_file_name, model):
  36         """
  37         save_model(model_file_name, model) -> None
  38
  39         Save a LIBLINEAR model to the file model_file_name.
  40         """
  41         liblinear.save_model(_cstr(model_file_name), model)
  42
  43 def train(arg1, arg2=None, arg3=None):
  44         """
  45         train(y, x [, options]) -> model | ACC
  46
  47         y: a list/tuple/ndarray of l true labels (type must be int/double).
  48
  49         x: 1. a list/tuple of l training instances. Feature vector of
  50               each training instance is a list/tuple or dictionary.
  51
  52            2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
  53
  54         train(prob [, options]) -> model | ACC
  55         train(prob, param) -> model | ACC
  56
  57         Train a model from data (y, x) or a problem prob using
  58         'options' or a parameter param.
  59
  60         If '-v' is specified in 'options' (i.e., cross validation)
  61         either accuracy (ACC) or mean-squared error (MSE) is returned.
  62
  63         options:
  64                 -s type : set type of solver (default 1)
  65                   for multi-class classification
  66                          0 -- L2-regularized logistic regression (primal)
  67                          1 -- L2-regularized L2-loss support vector classification (dual)
  68                          2 -- L2-regularized L2-loss support vector classification (primal)
  69                          3 -- L2-regularized L1-loss support vector classification (dual)
  70                          4 -- support vector classification by Crammer and Singer
  71                          5 -- L1-regularized L2-loss support vector classification
  72                          6 -- L1-regularized logistic regression
  73                          7 -- L2-regularized logistic regression (dual)
  74                   for regression
  75                         11 -- L2-regularized L2-loss support vector regression (primal)
  76                         12 -- L2-regularized L2-loss support vector regression (dual)
  77                         13 -- L2-regularized L1-loss support vector regression (dual)
  78                 -c cost : set the parameter C (default 1)
  79                 -p epsilon : set the epsilon in loss function of SVR (default 0.1)
  80                 -e epsilon : set tolerance of termination criterion
  81                         -s 0 and 2
  82                                 |f'(w)|_2 <= eps*min(pos,neg)/l*|f'(w0)|_2,
  83                                 where f is the primal function, (default 0.01)
  84                         -s 11
  85                                 |f'(w)|_2 <= eps*|f'(w0)|_2 (default 0.0001)
  86                         -s 1, 3, 4, and 7
  87                                 Dual maximal violation <= eps; similar to liblinear (default 0.)
  88                         -s 5 and 6
  89                                 |f'(w)|_inf <= eps*min(pos,neg)/l*|f'(w0)|_inf,
  90                                 where f is the primal function (default 0.01)
  91                         -s 12 and 13
  92                                 |f'(alpha)|_1 <= eps |f'(alpha0)|,
  93                                 where f is the dual function (default 0.1)
  94                 -B bias : if bias >= 0, instance x becomes [x; bias]; if < 0, no bias term added (default -1)
  95                 -wi weight: weights adjust the parameter C of different classes (see README for details)
  96                 -v n: n-fold cross validation mode
  97                 "-C : find parameters (C for -s 0, 2 and C, p for -s 11)\n"
  98                 -q : quiet mode (no outputs)
  99         """
 100         prob, param = None, None
 101         if isinstance(arg1, (list, tuple)) or (scipy and isinstance(arg1, scipy.ndarray)):
 102                 assert isinstance(arg2, (list, tuple)) or (scipy and isinstance(arg2, (scipy.ndarray, sparse.spmatrix)))
 103                 y, x, options = arg1, arg2, arg3
 104                 prob = problem(y, x)
 105                 param = parameter(options)
 106         elif isinstance(arg1, problem):
 107                 prob = arg1
 108                 if isinstance(arg2, parameter):
 109                         param = arg2
 110                 else:
 111                         param = parameter(arg2)
 112         if prob == None or param == None :
 113                 raise TypeError("Wrong types for the arguments")
 114
 115         prob.set_bias(param.bias)
 116         liblinear.set_print_string_function(param.print_func)
 117         err_msg = liblinear.check_parameter(prob, param)
 118         if err_msg :
 119                 raise ValueError('Error: %s' % err_msg)
 120
 121         if param.flag_find_parameters:
 122                 nr_fold = param.nr_fold
 123                 best_C = c_double()
 124                 best_p = c_double()
 125                 best_score = c_double()
 126                 if param.flag_C_specified:
 127                         start_C = param.C
 128                 else:
 129                         start_C = -1.0
 130                 if param.flag_p_specified:
 131                         start_p = param.p
 132                 else:
 133                         start_p = -1.0
 134                 liblinear.find_parameters(prob, param, nr_fold, start_C, start_p, best_C, best_p, best_score)
 135                 if param.solver_type in [L2R_LR, L2R_L2LOSS_SVC]:
 136                         print("Best C = %g  CV accuracy = %g%%\n"% (best_C.value, 100.0*best_score.value))
 137                 elif param.solver_type in [L2R_L2LOSS_SVR]:
 138                         print("Best C = %g Best p = %g  CV MSE = %g\n"% (best_C.value, best_p.value, best_score.value))
 139                 return best_C.value,best_p.value,best_score.value
 140
 141
 142         elif param.flag_cross_validation:
 143                 l, nr_fold = prob.l, param.nr_fold
 144                 target = (c_double * l)()
 145                 liblinear.cross_validation(prob, param, nr_fold, target)
 146                 ACC, MSE, SCC = evaluations(prob.y[:l], target[:l])
 147                 if param.solver_type in [L2R_L2LOSS_SVR, L2R_L2LOSS_SVR_DUAL, L2R_L1LOSS_SVR_DUAL]:
 148                         print("Cross Validation Mean squared error = %g" % MSE)
 149                         print("Cross Validation Squared correlation coefficient = %g" % SCC)
 150                         return MSE
 151                 else:
 152                         print("Cross Validation Accuracy = %g%%" % ACC)
 153                         return ACC
 154         else:
 155                 m = liblinear.train(prob, param)
 156                 m = toPyModel(m)
 157
 158                 return m
 159
 160 def predict(y, x, m, options=""):
 161         """
 162         predict(y, x, m [, options]) -> (p_labels, p_acc, p_vals)
 163
 164         y: a list/tuple/ndarray of l true labels (type must be int/double).
 165            It is used for calculating the accuracy. Use [] if true labels are
 166            unavailable.
 167
 168         x: 1. a list/tuple of l training instances. Feature vector of
 169               each training instance is a list/tuple or dictionary.
 170
 171            2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
 172
 173         Predict data (y, x) with the SVM model m.
 174         options:
 175             -b probability_estimates: whether to output probability estimates, 0 or 1 (default 0); currently for logistic regression only
 176             -q quiet mode (no outputs)
 177
 178         The return tuple contains
 179         p_labels: a list of predicted labels
 180         p_acc: a tuple including  accuracy (for classification), mean-squared
 181                error, and squared correlation coefficient (for regression).
 182         p_vals: a list of decision values or probability estimates (if '-b 1'
 183                 is specified). If k is the number of classes, for decision values,
 184                 each element includes results of predicting k binary-class
 185                 SVMs. if k = 2 and solver is not MCSVM_CS, only one decision value
 186                 is returned. For probabilities, each element contains k values
 187                 indicating the probability that the testing instance is in each class.
 188                 Note that the order of classes here is the same as 'model.label'
 189                 field in the model structure.
 190         """
 191
 192         def info(s):
 193                 print(s)
 194
 195         if scipy and isinstance(x, scipy.ndarray):
 196                 x = scipy.ascontiguousarray(x) # enforce row-major
 197         elif sparse and isinstance(x, sparse.spmatrix):
 198                 x = x.tocsr()
 199         elif not isinstance(x, (list, tuple)):
 200                 raise TypeError("type of x: {0} is not supported!".format(type(x)))
 201
 202         if (not isinstance(y, (list, tuple))) and (not (scipy and isinstance(y, scipy.ndarray))):
 203                 raise TypeError("type of y: {0} is not supported!".format(type(y)))
 204
 205         predict_probability = 0
 206         argv = options.split()
 207         i = 0
 208         while i < len(argv):
 209                 if argv[i] == '-b':
 210                         i += 1
 211                         predict_probability = int(argv[i])
 212                 elif argv[i] == '-q':
 213                         info = print_null
 214                 else:
 215                         raise ValueError("Wrong options")
 216                 i+=1
 217
 218         solver_type = m.param.solver_type
 219         nr_class = m.get_nr_class()
 220         nr_feature = m.get_nr_feature()
 221         is_prob_model = m.is_probability_model()
 222         bias = m.bias
 223         if bias >= 0:
 224                 biasterm = feature_node(nr_feature+1, bias)
 225         else:
 226                 biasterm = feature_node(-1, bias)
 227         pred_labels = []
 228         pred_values = []
 229
 230         if scipy and isinstance(x, sparse.spmatrix):
 231                 nr_instance = x.shape[0]
 232         else:
 233                 nr_instance = len(x)
 234
 235         if predict_probability:
 236                 if not is_prob_model:
 237                         raise TypeError('probability output is only supported for logistic regression')
 238                 prob_estimates = (c_double * nr_class)()
 239                 for i in range(nr_instance):
 240                         if scipy and isinstance(x, sparse.spmatrix):
 241                                 indslice = slice(x.indptr[i], x.indptr[i+1])
 242                                 xi, idx = gen_feature_nodearray((x.indices[indslice], x.data[indslice]), feature_max=nr_feature)
 243                         else:
 244                                 xi, idx = gen_feature_nodearray(x[i], feature_max=nr_feature)
 245                         xi[-2] = biasterm
 246                         label = liblinear.predict_probability(m, xi, prob_estimates)
 247                         values = prob_estimates[:nr_class]
 248                         pred_labels += [label]
 249                         pred_values += [values]
 250         else:
 251                 if nr_class <= 2:
 252                         nr_classifier = 1
 253                 else:
 254                         nr_classifier = nr_class
 255                 dec_values = (c_double * nr_classifier)()
 256                 for i in range(nr_instance):
 257                         if scipy and isinstance(x, sparse.spmatrix):
 258                                 indslice = slice(x.indptr[i], x.indptr[i+1])
 259                                 xi, idx = gen_feature_nodearray((x.indices[indslice], x.data[indslice]), feature_max=nr_feature)
 260                         else:
 261                                 xi, idx = gen_feature_nodearray(x[i], feature_max=nr_feature)
 262                         xi[-2] = biasterm
 263                         label = liblinear.predict_values(m, xi, dec_values)
 264                         values = dec_values[:nr_classifier]
 265                         pred_labels += [label]
 266                         pred_values += [values]
 267
 268         if len(y) == 0:
 269                 y = [0] * nr_instance
 270         ACC, MSE, SCC = evaluations(y, pred_labels)
 271
 272         if m.is_regression_model():
 273                 info("Mean squared error = %g (regression)" % MSE)
 274                 info("Squared correlation coefficient = %g (regression)" % SCC)
 275         else:
 276                 info("Accuracy = %g%% (%d/%d) (classification)" % (ACC, int(round(nr_instance*ACC/100)), nr_instance))
 277
 278         return pred_labels, (ACC, MSE, SCC), pred_values