import os
import sys

# Put this file's directory first on the module search path before the
# imports below run. The order of the following imports is significant:
# they are wildcard imports and must come after the sys.path update.
sys.path = [os.path.dirname(os.path.abspath(__file__))] + sys.path
from liblinear import *
from liblinear import __all__ as liblinear_all
from liblinear import scipy, sparse
from commonutil import *
from commonutil import __all__ as common_all
from ctypes import c_double

if sys.version_info[0] < 3:
    # Python 2: use the lazy variants of range/zip.
    range = xrange
    from itertools import izip as zip

    def _cstr(s):
        """Return s as a byte string that can be handed to the C library."""
        return s.encode("utf-8") if isinstance(s, unicode) else str(s)
else:
    def _cstr(s):
        """Return s as UTF-8 encoded bytes that can be handed to the C library.

        A value that is already bytes is returned unchanged; a str is
        encoded as UTF-8. Any other type raises TypeError (from bytes()).
        """
        if isinstance(s, bytes):
            return s
        return bytes(s, "utf-8")

# Public API: the four helpers defined in this module plus everything the
# liblinear and commonutil modules export.
__all__ = ['load_model', 'save_model', 'train', 'predict'] + liblinear_all + common_all
def load_model(model_file_name):
    """
    load_model(model_file_name) -> model

    Load a LIBLINEAR model from model_file_name and return it.

    If the underlying library call yields a false (null) result, a message
    is printed and None is returned instead of raising.
    """
    model = liblinear.load_model(_cstr(model_file_name))
    if not model:
        print("can't open model file %s" % model_file_name)
        return None
    # Convert the raw result of the C call into the Python-side model object.
    return toPyModel(model)
def save_model(model_file_name, model):
    """
    save_model(model_file_name, model) -> None

    Save a LIBLINEAR model to the file model_file_name.
    """
    # The C library expects the file name as a byte string.
    encoded_name = _cstr(model_file_name)
    liblinear.save_model(encoded_name, model)
def train(arg1, arg2=None, arg3=None):
    """
    train(y, x [, options]) -> model | ACC | MSE

    y: a list/tuple/ndarray of l true labels (type must be int/double).

    x: 1. a list/tuple of l training instances. Feature vector of
          each training instance is a list/tuple or dictionary.

       2. an l * n numpy ndarray or scipy spmatrix (n: number of features).

    train(prob [, options]) -> model | ACC | MSE
    train(prob, param) -> model | ACC | MSE

    Train a model from data (y, x) or a problem prob using
    'options' or a parameter param.

    If '-v' is specified in 'options' (i.e., cross validation)
    either accuracy (ACC) or mean-squared error (MSE) is returned.

    If '-C' is specified in 'options' (i.e., parameter search), the tuple
    (best_C, best_p, best_score) is returned instead of a model.

    Raises TypeError if the arguments match none of the call forms above,
    and ValueError if LIBLINEAR rejects the problem/parameter combination.

    options:
        -s type : set type of solver (default 1)
          for multi-class classification
             0 -- L2-regularized logistic regression (primal)
             1 -- L2-regularized L2-loss support vector classification (dual)
             2 -- L2-regularized L2-loss support vector classification (primal)
             3 -- L2-regularized L1-loss support vector classification (dual)
             4 -- support vector classification by Crammer and Singer
             5 -- L1-regularized L2-loss support vector classification
             6 -- L1-regularized logistic regression
             7 -- L2-regularized logistic regression (dual)
          for regression
            11 -- L2-regularized L2-loss support vector regression (primal)
            12 -- L2-regularized L2-loss support vector regression (dual)
            13 -- L2-regularized L1-loss support vector regression (dual)
        -c cost : set the parameter C (default 1)
        -p epsilon : set the epsilon in loss function of SVR (default 0.1)
        -e epsilon : set tolerance of termination criterion
            -s 0 and 2
                |f'(w)|_2 <= eps*min(pos,neg)/l*|f'(w0)|_2,
                where f is the primal function, (default 0.01)
            -s 11
                |f'(w)|_2 <= eps*|f'(w0)|_2 (default 0.0001)
            -s 1, 3, 4, and 7
                Dual maximal violation <= eps; similar to libsvm (default 0.1)
            -s 5 and 6
                |f'(w)|_inf <= eps*min(pos,neg)/l*|f'(w0)|_inf,
                where f is the primal function (default 0.01)
            -s 12 and 13
                |f'(alpha)|_1 <= eps |f'(alpha0)|,
                where f is the dual function (default 0.1)
        -B bias : if bias >= 0, instance x becomes [x; bias]; if < 0, no bias term added (default -1)
        -wi weight: weights adjust the parameter C of different classes (see README for details)
        -v n: n-fold cross validation mode
        -C : find parameters (C for -s 0, 2 and C, p for -s 11)
        -q : quiet mode (no outputs)
    """
    prob, param = None, None
    if isinstance(arg1, (list, tuple)) or (scipy and isinstance(arg1, scipy.ndarray)):
        # Call form train(y, x [, options]).
        x_type_ok = isinstance(arg2, (list, tuple)) or (
            scipy and isinstance(arg2, (scipy.ndarray, sparse.spmatrix)))
        if not x_type_ok:
            # Explicit raise instead of assert, so the check survives `python -O`.
            raise TypeError("Wrong types for the arguments")
        y, x, options = arg1, arg2, arg3
        prob = problem(y, x)
        param = parameter(options)
    elif isinstance(arg1, problem):
        # Call forms train(prob [, options]) and train(prob, param).
        prob = arg1
        if isinstance(arg2, parameter):
            param = arg2
        else:
            param = parameter(arg2)
    if prob is None or param is None:
        raise TypeError("Wrong types for the arguments")

    prob.set_bias(param.bias)
    liblinear.set_print_string_function(param.print_func)
    err_msg = liblinear.check_parameter(prob, param)
    if err_msg:
        raise ValueError('Error: %s' % err_msg)

    if param.flag_find_parameters:
        nr_fold = param.nr_fold
        # Output slots filled in by the C routine.
        best_C = c_double()
        best_p = c_double()
        best_score = c_double()
        # A negative start value is passed when the user gave none; per the
        # LIBLINEAR README a non-positive start lets the library choose one.
        start_C = param.C if param.flag_C_specified else -1.0
        start_p = param.p if param.flag_p_specified else -1.0
        liblinear.find_parameters(prob, param, nr_fold, start_C, start_p, best_C, best_p, best_score)
        if param.solver_type in [L2R_LR, L2R_L2LOSS_SVC]:
            print("Best C = %g CV accuracy = %g%%\n"% (best_C.value, 100.0*best_score.value))
        elif param.solver_type in [L2R_L2LOSS_SVR]:
            print("Best C = %g Best p = %g CV MSE = %g\n"% (best_C.value, best_p.value, best_score.value))
        return best_C.value, best_p.value, best_score.value

    if param.flag_cross_validation:
        l, nr_fold = prob.l, param.nr_fold
        target = (c_double * l)()
        liblinear.cross_validation(prob, param, nr_fold, target)
        ACC, MSE, SCC = evaluations(prob.y[:l], target[:l])
        if param.solver_type in [L2R_L2LOSS_SVR, L2R_L2LOSS_SVR_DUAL, L2R_L1LOSS_SVR_DUAL]:
            print("Cross Validation Mean squared error = %g" % MSE)
            print("Cross Validation Squared correlation coefficient = %g" % SCC)
            return MSE
        print("Cross Validation Accuracy = %g%%" % ACC)
        return ACC

    # Plain training: wrap the raw result of the C call as a Python model.
    m = liblinear.train(prob, param)
    m = toPyModel(m)
    return m
def predict(y, x, m, options=""):
    """
    predict(y, x, m [, options]) -> (p_labels, p_acc, p_vals)

    y: a list/tuple/ndarray of l true labels (type must be int/double).
       It is used for calculating the accuracy. Use [] if true labels are
       unavailable.

    x: 1. a list/tuple of l testing instances. Feature vector of
          each testing instance is a list/tuple or dictionary.

       2. an l * n numpy ndarray or scipy spmatrix (n: number of features).

    Predict data (y, x) with the LIBLINEAR model m.
    options:
        -b probability_estimates: whether to output probability estimates, 0 or 1 (default 0); currently for logistic regression only
        -q quiet mode (no outputs)

    The return tuple contains
    p_labels: a list of predicted labels
    p_acc: a tuple including accuracy (for classification), mean-squared
           error, and squared correlation coefficient (for regression).
    p_vals: a list of decision values or probability estimates (if '-b 1'
            is specified). If k is the number of classes, for decision values,
            each element includes results of predicting k binary-class
            SVMs. if k = 2 and solver is not MCSVM_CS, only one decision value
            is returned. For probabilities, each element contains k values
            indicating the probability that the testing instance is in each class.
            Note that the order of classes here is the same as 'model.label'
            field in the model structure.

    Raises TypeError for an unsupported type of x or y, or when '-b 1' is
    requested for a model that is not a probability model, and ValueError
    for an unknown or incomplete option.
    """
    if scipy and isinstance(x, scipy.ndarray):
        x = scipy.ascontiguousarray(x)  # enforce row-major
    elif sparse and isinstance(x, sparse.spmatrix):
        # Row slicing below relies on the CSR attributes indptr/indices/data.
        x = x.tocsr()
    elif not isinstance(x, (list, tuple)):
        raise TypeError("type of x: {0} is not supported!".format(type(x)))

    if (not isinstance(y, (list, tuple))) and (not (scipy and isinstance(y, scipy.ndarray))):
        raise TypeError("type of y: {0} is not supported!".format(type(y)))

    # Parse the option string.
    predict_probability = 0
    quiet = False
    tokens = iter(options.split())
    for token in tokens:
        if token == '-b':
            value = next(tokens, None)
            if value is None:
                # '-b' given as the last token without its 0/1 argument.
                raise ValueError("Wrong options")
            predict_probability = int(value)
        elif token == '-q':
            quiet = True
        else:
            raise ValueError("Wrong options")

    def info(s):
        # Report a result line unless quiet mode ('-q') was requested.
        if not quiet:
            print(s)

    solver_type = m.param.solver_type
    nr_class = m.get_nr_class()
    nr_feature = m.get_nr_feature()
    is_prob_model = m.is_probability_model()
    bias = m.bias
    if bias >= 0:
        biasterm = feature_node(nr_feature+1, bias)
    else:
        biasterm = feature_node(-1, bias)

    x_is_sparse = bool(scipy and isinstance(x, sparse.spmatrix))
    if x_is_sparse:
        nr_instance = x.shape[0]
    else:
        nr_instance = len(x)

    # Select the C routine and the size of its per-instance output buffer.
    if predict_probability:
        if not is_prob_model:
            raise TypeError('probability output is only supported for logistic regression')
        nr_output = nr_class
        c_predict = liblinear.predict_probability
    else:
        # One decision value is produced only when there are at most two
        # classes and the solver is not MCSVM_CS; otherwise one per class.
        # Sizing the buffer without the solver check would make it too
        # small for a 2-class MCSVM_CS model.
        if nr_class <= 2 and solver_type != MCSVM_CS:
            nr_output = 1
        else:
            nr_output = nr_class
        c_predict = liblinear.predict_values
    output_buffer = (c_double * nr_output)()

    pred_labels = []
    pred_values = []
    for i in range(nr_instance):
        if x_is_sparse:
            indslice = slice(x.indptr[i], x.indptr[i+1])
            row = (x.indices[indslice], x.data[indslice])
        else:
            row = x[i]
        xi, _ = gen_feature_nodearray(row, feature_max=nr_feature)
        # The second-to-last node of the array is the slot for the bias term.
        xi[-2] = biasterm
        label = c_predict(m, xi, output_buffer)
        pred_labels.append(label)
        # Slice to copy the values: the buffer is reused on the next iteration.
        pred_values.append(output_buffer[:nr_output])

    if len(y) == 0:
        # No true labels supplied: evaluate against zeros so the metrics
        # can still be computed.
        y = [0] * nr_instance
    ACC, MSE, SCC = evaluations(y, pred_labels)

    if m.is_regression_model():
        info("Mean squared error = %g (regression)" % MSE)
        info("Squared correlation coefficient = %g (regression)" % SCC)
    else:
        info("Accuracy = %g%% (%d/%d) (classification)" % (ACC, int(round(nr_instance*ACC/100)), nr_instance))

    return pred_labels, (ACC, MSE, SCC), pred_values