Quick Start
===========
+"Quick Start with Scipy" is in the next section.
+
There are two levels of usage. The high-level one uses utility functions
in liblinearutil.py and the usage is the same as the LIBLINEAR MATLAB interface.
>>> x0, max_idx = gen_feature_nodearray({1:1, 3:1})
>>> label = liblinear.predict(m, x0)
+Quick Start with Scipy
+======================
+
+Make sure you have Scipy installed to proceed in this section.
+If numba (http://numba.pydata.org) is installed, some operations will be much faster.
+
+There are two levels of usage. The high-level one uses utility functions
+in liblinearutil.py and the usage is the same as the LIBLINEAR MATLAB interface.
+
+>>> import scipy
+>>> from liblinearutil import *
+# Read data in LIBSVM format
+>>> y, x = svm_read_problem('../heart_scale', return_scipy = True) # y: ndarray, x: csr_matrix
+>>> m = train(y[:200], x[:200, :], '-c 4')
+>>> p_label, p_acc, p_val = predict(y[200:], x[200:, :], m)
+
+# Construct problem in Scipy format
+# Dense data: numpy ndarray
+>>> y, x = scipy.asarray([1,-1]), scipy.asarray([[1,0,1], [-1,0,-1]])
+# Sparse data: scipy csr_matrix((data, (row_ind, col_ind)))
+>>> y, x = scipy.asarray([1,-1]), scipy.sparse.csr_matrix(([1, 1, -1, -1], ([0, 0, 1, 1], [0, 2, 0, 2])))
+>>> prob = problem(y, x)
+>>> param = parameter('-s 0 -c 4 -B 1')
+>>> m = train(prob, param)
+
+# Other utility functions
+>>> save_model('heart_scale.model', m)
+>>> m = load_model('heart_scale.model')
+>>> p_label, p_acc, p_val = predict(y, x, m, '-b 1')
+>>> ACC, MSE, SCC = evaluations(y, p_label)
+
+# Getting online help
+>>> help(train)
+
+The low-level use directly calls C interfaces imported by liblinear.py. Note that
+all arguments and return values are in ctypes format. You need to handle them
+carefully.
+
+>>> from liblinear import *
+>>> prob = problem(scipy.asarray([1,-1]), scipy.sparse.csr_matrix(([1, 1, -1, -1], ([0, 0, 1, 1], [0, 2, 0, 2]))))
+>>> param = parameter('-c 4')
+>>> m = liblinear.train(prob, param) # m is a ctypes pointer to a model
+# Convert a tuple of ndarray (index, data) to feature_nodearray, a ctypes structure
+# Note that indices start from 0 here; the following example is converted to 1:1, 3:1 internally
+>>> x0, max_idx = gen_feature_nodearray((scipy.asarray([0,2]), scipy.asarray([1,1])))
+>>> label = liblinear.predict(m, x0)
+
Design Description
==================
>>> print(node)
-- Function: gen_feature_nodearray(xi [,feature_max=None [,issparse=True]])
+- Function: gen_feature_nodearray(xi [,feature_max=None])
- Generate a feature vector from a Python list/tuple or a dictionary:
+ Generate a feature vector from a Python list/tuple/dictionary, numpy ndarray or tuple of (index, data):
- >>> xi, max_idx = gen_feature_nodearray({1:1, 3:1, 5:-2})
+ >>> xi_ctype, max_idx = gen_feature_nodearray({1:1, 3:1, 5:-2})
- xi: the returned feature_nodearray (a ctypes structure)
+ xi_ctype: the returned feature_nodearray (a ctypes structure)
max_idx: the maximal feature index of xi
- issparse: if issparse == True, zero feature values are removed. The default
- value is True for the sparsity.
-
feature_max: if feature_max is assigned, features with indices larger than
feature_max are removed.
>>> prob = problem(y, x [,bias=-1])
- y: a Python list/tuple of l labels (type must be int/double).
+ y: a Python list/tuple/ndarray of l labels (type must be int/double).
- x: a Python list/tuple of l data instances. Each element of x must be
- an instance of list/tuple/dictionary type.
+ x: 1. a list/tuple of l training instances. Feature vector of
+ each training instance is a list/tuple or dictionary.
+
+ 2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
bias: if bias >= 0, instance x becomes [x; bias]; if < 0, no bias term
added (default -1)
>>> model = train(prob [, 'training_options'])
>>> model = train(prob, param)
- y: a list/tuple of l training labels (type must be int/double).
+ y: a list/tuple/ndarray of l training labels (type must be int/double).
+
+ x: 1. a list/tuple of l training instances. Feature vector of
+ each training instance is a list/tuple or dictionary.
- x: a list/tuple of l training instances. The feature vector of
- each training instance is an instance of list/tuple or dictionary.
+ 2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
training_options: a string in the same form as that for LIBLINEAR command
mode.
>>> p_labs, p_acc, p_vals = predict(y, x, model [,'predicting_options'])
- y: a list/tuple of l true labels (type must be int/double). It is used
- for calculating the accuracy. Use [] if true labels are
+ y: a list/tuple/ndarray of l true labels (type must be int/double).
+ It is used for calculating the accuracy. Use [] if true labels are
unavailable.
- x: a list/tuple of l predicting instances. The feature vector of
- each predicting instance is an instance of list/tuple or dictionary.
+ x: 1. a list/tuple of l predicting instances. Feature vector of
+ each predicting instance is a list/tuple or dictionary.
+
+ 2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
predicting_options: a string of predicting options in the same format as
that of LIBLINEAR.
Calculate some evaluations using the true values (ty) and predicted
values (pv):
- >>> (ACC, MSE, SCC) = evaluations(ty, pv)
+ >>> (ACC, MSE, SCC) = evaluations(ty, pv, useScipy)
+
+ ty: a list/tuple/ndarray of true values.
- ty: a list of true values.
+ pv: a list/tuple/ndarray of predicted values.
- pv: a list of predict values.
+ useScipy: convert ty, pv to ndarray, and use scipy functions to do the evaluation
ACC: accuracy.
from os import path
import sys
+try:
+ import scipy
+ from scipy import sparse
+except:
+ scipy = None
+ sparse = None
+
+if sys.version_info[0] < 3:
+ range = xrange
+ from itertools import izip as zip
+
__all__ = ['liblinear', 'feature_node', 'gen_feature_nodearray', 'problem',
'parameter', 'model', 'toPyModel', 'L2R_LR', 'L2R_L2LOSS_SVC_DUAL',
'L2R_L2LOSS_SVC', 'L2R_L1LOSS_SVC_DUAL', 'MCSVM_CS',
def __str__(self):
return '%d:%g' % (self.index, self.value)
-def gen_feature_nodearray(xi, feature_max=None, issparse=True):
- if isinstance(xi, dict):
- index_range = xi.keys()
- elif isinstance(xi, (list, tuple)):
- xi = [0] + xi # idx should start from 1
- index_range = range(1, len(xi))
- else:
- raise TypeError('xi should be a dictionary, list or tuple')
-
+def gen_feature_nodearray(xi, feature_max=None):
+ # Build a ctypes feature_node array for one instance xi and return it
+ # together with max_idx (the last stored feature index, or 0 if none).
+ # Accepted xi: a dict {index: value}, a list/tuple of values, a numpy
+ # ndarray, or a tuple (index ndarray, data ndarray) with 0-based indices.
+ # When feature_max is given, indices larger than it are dropped.
+ # Any other type raises TypeError.
if feature_max:
assert(isinstance(feature_max, int))
- index_range = filter(lambda j: j <= feature_max, index_range)
- if issparse:
- index_range = filter(lambda j:xi[j] != 0, index_range)
- index_range = sorted(index_range)
- ret = (feature_node * (len(index_range)+2))()
+ # xi_shift is the offset between a 1-based feature index and its position in xi
+ xi_shift = 0 # ensure correct indices of xi
+ if scipy and isinstance(xi, tuple) and len(xi) == 2\
+ and isinstance(xi[0], scipy.ndarray) and isinstance(xi[1], scipy.ndarray): # for a sparse vector
+ index_range = xi[0] + 1 # index starts from 1
+ if feature_max:
+ index_range = index_range[scipy.where(index_range <= feature_max)]
+ elif scipy and isinstance(xi, scipy.ndarray):
+ xi_shift = 1
+ index_range = xi.nonzero()[0] + 1 # index starts from 1
+ if feature_max:
+ index_range = index_range[scipy.where(index_range <= feature_max)]
+ elif isinstance(xi, (dict, list, tuple)):
+ if isinstance(xi, dict):
+ index_range = xi.keys()
+ elif isinstance(xi, (list, tuple)):
+ xi_shift = 1
+ index_range = range(1, len(xi) + 1)
+ index_range = filter(lambda j: xi[j-xi_shift] != 0, index_range)
+
+ if feature_max:
+ index_range = filter(lambda j: j <= feature_max, index_range)
+ index_range = sorted(index_range)
+ else:
+ raise TypeError('xi should be a dictionary, list, tuple, 1-d numpy array, or tuple of (index, data)')
+
+ # two extra trailing nodes, both given index -1
+ ret = (feature_node*(len(index_range)+2))()
ret[-1].index = -1 # for bias term
ret[-2].index = -1
- for idx, j in enumerate(index_range):
- ret[idx].index = j
- ret[idx].value = xi[j]
+
+ if scipy and isinstance(xi, tuple) and len(xi) == 2\
+ and isinstance(xi[0], scipy.ndarray) and isinstance(xi[1], scipy.ndarray): # for a sparse vector
+ # NOTE(review): values are read by position in the (possibly filtered)
+ # index_range, so this assumes the entries removed by feature_max form
+ # a suffix of xi[0] (e.g. sorted column indices) -- confirm with callers.
+ for idx, j in enumerate(index_range):
+ ret[idx].index = j
+ ret[idx].value = (xi[1])[idx]
+ else:
+ for idx, j in enumerate(index_range):
+ ret[idx].index = j
+ ret[idx].value = xi[j - xi_shift]
+
max_idx = 0
- if index_range :
+ if len(index_range) > 0:
max_idx = index_range[-1]
return ret, max_idx
+try:
+ from numba import jit
+ jit_enabled = True
+except:
+ jit = lambda x: x
+ jit_enabled = False
+
+@jit
+def csr_to_problem_jit(l, x_val, x_ind, x_rowptr, prob_val, prob_ind, prob_rowptr):
+ # Element-by-element copy of l CSR rows into the problem arrays (jit-decorated).
+ # Row i of x spans x_rowptr[i]:x_rowptr[i+1]; it is written starting at
+ # prob_rowptr[i], with every column index shifted by +1.
+ # e1/b1 bound the source row; e2 is computed but not read below.
+ for i in range(l):
+ b1,e1 = x_rowptr[i], x_rowptr[i+1]
+ b2,e2 = prob_rowptr[i], prob_rowptr[i+1]-2
+ for j in range(b1,e1):
+ prob_ind[j-b1+b2] = x_ind[j]+1
+ prob_val[j-b1+b2] = x_val[j]
+def csr_to_problem_nojit(l, x_val, x_ind, x_rowptr, prob_val, prob_ind, prob_rowptr):
+ # Slice-assignment variant of the CSR copy, used when numba's jit is unavailable.
+ # For each of the l rows, the destination slice stops two slots before the
+ # next row's start, and column indices are shifted by +1.
+ for i in range(l):
+ x_slice = slice(x_rowptr[i], x_rowptr[i+1])
+ prob_slice = slice(prob_rowptr[i], prob_rowptr[i+1]-2)
+ prob_ind[prob_slice] = x_ind[x_slice]+1
+ prob_val[prob_slice] = x_val[x_slice]
+
+def csr_to_problem(x, prob):
+ # Fill prob.x_space and prob.rowptr from the CSR matrix x.
+ # x_space holds x.nnz nodes plus two extra nodes per row; rowptr is a copy
+ # of x.indptr with entry k (k >= 1) moved forward by 2*k to make that room.
+ # Extra space for termination node and (possibly) bias term
+ x_space = prob.x_space = scipy.empty((x.nnz+x.shape[0]*2), dtype=feature_node)
+ prob.rowptr = x.indptr.copy()
+ prob.rowptr[1:] += 2*scipy.arange(1,x.shape[0]+1)
+ prob_ind = x_space["index"]
+ prob_val = x_space["value"]
+ # every index starts as -1; the copy below overwrites only the data slots
+ prob_ind[:] = -1
+ if jit_enabled:
+ csr_to_problem_jit(x.shape[0], x.data, x.indices, x.indptr, prob_val, prob_ind, prob.rowptr)
+ else:
+ csr_to_problem_nojit(x.shape[0], x.data, x.indices, x.indptr, prob_val, prob_ind, prob.rowptr)
+
class problem(Structure):
_names = ["l", "n", "y", "x", "bias"]
_types = [c_int, c_int, POINTER(c_double), POINTER(POINTER(feature_node)), c_double]
_fields_ = genFields(_names, _types)
def __init__(self, y, x, bias = -1):
- if len(y) != len(x) :
- raise ValueError("len(y) != len(x)")
+ if (not isinstance(y, (list, tuple))) and (not (scipy and isinstance(y, scipy.ndarray))):
+ raise TypeError("type of y: {0} is not supported!".format(type(y)))
+
+ if isinstance(x, (list, tuple)):
+ if len(y) != len(x):
+ raise ValueError("len(y) != len(x)")
+ elif scipy != None and isinstance(x, (scipy.ndarray, sparse.spmatrix)):
+ if len(y) != x.shape[0]:
+ raise ValueError("len(y) != len(x)")
+ if isinstance(x, scipy.ndarray):
+ x = scipy.ascontiguousarray(x) # enforce row-major
+ if isinstance(x, sparse.spmatrix):
+ x = x.tocsr()
+ pass
+ else:
+ raise TypeError("type of x: {0} is not supported!".format(type(x)))
self.l = l = len(y)
self.bias = -1
max_idx = 0
x_space = self.x_space = []
- for i, xi in enumerate(x):
- tmp_xi, tmp_idx = gen_feature_nodearray(xi)
- x_space += [tmp_xi]
- max_idx = max(max_idx, tmp_idx)
+ if scipy != None and isinstance(x, sparse.csr_matrix):
+ csr_to_problem(x, self)
+ max_idx = x.shape[1]
+ else:
+ for i, xi in enumerate(x):
+ tmp_xi, tmp_idx = gen_feature_nodearray(xi)
+ x_space += [tmp_xi]
+ max_idx = max(max_idx, tmp_idx)
self.n = max_idx
self.y = (c_double * l)()
- for i, yi in enumerate(y): self.y[i] = y[i]
+ if scipy != None and isinstance(y, scipy.ndarray):
+ scipy.ctypeslib.as_array(self.y, (self.l,))[:] = y
+ else:
+ for i, yi in enumerate(y): self.y[i] = yi
self.x = (POINTER(feature_node) * l)()
- for i, xi in enumerate(self.x_space): self.x[i] = xi
+ if scipy != None and isinstance(x, sparse.csr_matrix):
+ base = addressof(self.x_space.ctypes.data_as(POINTER(feature_node))[0])
+ x_ptr = cast(self.x, POINTER(c_uint64))
+ x_ptr = scipy.ctypeslib.as_array(x_ptr,(self.l,))
+ x_ptr[:] = self.rowptr[:-1]*sizeof(feature_node)+base
+ else:
+ for i, xi in enumerate(self.x_space): self.x[i] = xi
self.set_bias(bias)
self.n -= 1
node = feature_node(-1, bias)
- for xi in self.x_space:
- xi[-2] = node
+ if isinstance(self.x_space, list):
+ for xi in self.x_space:
+ xi[-2] = node
+ else:
+ self.x_space["index"][self.rowptr[1:]-2] = node.index
+ self.x_space["value"][self.rowptr[1:]-2] = node.value
+
self.bias = bias
elif argv[i] == "-C":
self.flag_find_C = True
- else :
+ else:
raise ValueError("Wrong options")
i += 1
sys.path = [os.path.dirname(os.path.abspath(__file__))] + sys.path
from liblinear import *
from liblinear import __all__ as liblinear_all
+from liblinear import scipy, sparse
from ctypes import c_double
+if sys.version_info[0] < 3:
+ range = xrange
+ from itertools import izip as zip
+
__all__ = ['svm_read_problem', 'load_model', 'save_model', 'evaluations',
'train', 'predict'] + liblinear_all
-def svm_read_problem(data_file_name):
+def svm_read_problem(data_file_name, return_scipy=False):
"""
- svm_read_problem(data_file_name) -> [y, x]
+ svm_read_problem(data_file_name, return_scipy=False) -> [y, x], y: list, x: list of dictionary
+ svm_read_problem(data_file_name, return_scipy=True) -> [y, x], y: ndarray, x: csr_matrix
Read LIBSVM-format data from data_file_name and return labels y
and data instances x.
"""
prob_y = []
prob_x = []
- for line in open(data_file_name):
+ # row_ptr/col_idx accumulate the CSR structure when return_scipy is used
+ row_ptr = [0]
+ col_idx = []
+ for i, line in enumerate(open(data_file_name)):
line = line.split(None, 1)
# In case an instance with all zero features
if len(line) == 1: line += ['']
label, features = line
- xi = {}
- for e in features.split():
- ind, val = e.split(":")
- xi[int(ind)] = float(val)
prob_y += [float(label)]
- prob_x += [xi]
+ if scipy != None and return_scipy:
+ nz = 0
+ for e in features.split():
+ ind, val = e.split(":")
+ val = float(val)
+ # explicit zeros are not stored; file indices are 1-based, columns 0-based
+ if val != 0:
+ col_idx += [int(ind)-1]
+ prob_x += [val]
+ nz += 1
+ row_ptr += [row_ptr[-1]+nz]
+ else:
+ xi = {}
+ for e in features.split():
+ ind, val = e.split(":")
+ # val is still a string here, so a `val != 0` guard would always be
+ # true; store every listed feature, as the code did before this patch.
+ xi[int(ind)] = float(val)
+ prob_x += [xi]
+ if scipy != None and return_scipy:
+ prob_y = scipy.array(prob_y)
+ prob_x = scipy.array(prob_x)
+ col_idx = scipy.array(col_idx)
+ row_ptr = scipy.array(row_ptr)
+ prob_x = sparse.csr_matrix((prob_x, col_idx, row_ptr))
return (prob_y, prob_x)
def load_model(model_file_name):
"""
liblinear.save_model(model_file_name.encode(), model)
-def evaluations(ty, pv):
+def evaluations_scipy(ty, pv):
"""
- evaluations(ty, pv) -> (ACC, MSE, SCC)
+ evaluations_scipy(ty, pv) -> (ACC, MSE, SCC)
+ ty, pv: ndarray
Calculate accuracy, mean squared error and squared correlation coefficient
using the true values (ty) and predicted values (pv).
+
+ Raises TypeError if scipy is unavailable or ty/pv is not an ndarray,
+ and ValueError if ty and pv differ in length.
"""
+ if not (scipy != None and isinstance(ty, scipy.ndarray) and isinstance(pv, scipy.ndarray)):
+ raise TypeError("type of ty and pv must be ndarray")
if len(ty) != len(pv):
- raise ValueError("len(ty) must equal to len(pv)")
+ raise ValueError("len(ty) must be equal to len(pv)")
+ # ACC is a percentage: the share of positions where ty equals pv, times 100
+ ACC = 100.0*(ty == pv).mean()
+ MSE = ((ty - pv)**2).mean()
+ l = len(ty)
+ sumv = pv.sum()
+ sumy = ty.sum()
+ sumvy = (pv*ty).sum()
+ sumvv = (pv*pv).sum()
+ sumyy = (ty*ty).sum()
+ # errstate(all='raise') turns floating-point warnings into exceptions, so a
+ # failing SCC computation (e.g. a zero denominator) falls back to nan
+ with scipy.errstate(all = 'raise'):
+ try:
+ SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
+ except:
+ SCC = float('nan')
+ return (float(ACC), float(MSE), float(SCC))
+
+def evaluations(ty, pv, useScipy = True):
+ """
+ evaluations(ty, pv, useScipy) -> (ACC, MSE, SCC)
+ ty, pv: list, tuple or ndarray
+ useScipy: convert ty, pv to ndarray, and use scipy functions for the evaluation
+
+ Calculate accuracy, mean squared error and squared correlation coefficient
+ using the true values (ty) and predicted values (pv).
+ """
+ if scipy != None and useScipy:
+ return evaluations_scipy(scipy.asarray(ty), scipy.asarray(pv))
+ if len(ty) != len(pv):
+ raise ValueError("len(ty) must be equal to len(pv)")
total_correct = total_error = 0
sumv = sumy = sumvv = sumyy = sumvy = 0
for v, y in zip(pv, ty):
SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
except:
SCC = float('nan')
- return (ACC, MSE, SCC)
+ return (float(ACC), float(MSE), float(SCC))
def train(arg1, arg2=None, arg3=None):
"""
train(y, x [, options]) -> model | ACC
+
+ y: a list/tuple/ndarray of l training labels (type must be int/double).
+
+ x: 1. a list/tuple of l training instances. Feature vector of
+ each training instance is a list/tuple or dictionary.
+
+ 2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
+
train(prob [, options]) -> model | ACC
train(prob, param) -> model | ACC
Train a model from data (y, x) or a problem prob using
'options' or a parameter param.
+
If '-v' is specified in 'options' (i.e., cross validation)
either accuracy (ACC) or mean-squared error (MSE) is returned.
-q : quiet mode (no outputs)
"""
prob, param = None, None
- if isinstance(arg1, (list, tuple)):
- assert isinstance(arg2, (list, tuple))
+ if isinstance(arg1, (list, tuple)) or (scipy and isinstance(arg1, scipy.ndarray)):
+ assert isinstance(arg2, (list, tuple)) or (scipy and isinstance(arg2, (scipy.ndarray, sparse.spmatrix)))
y, x, options = arg1, arg2, arg3
prob = problem(y, x)
param = parameter(options)
prob = arg1
if isinstance(arg2, parameter):
param = arg2
- else :
+ else:
param = parameter(arg2)
if prob == None or param == None :
raise TypeError("Wrong types for the arguments")
else:
print("Cross Validation Accuracy = %g%%" % ACC)
return ACC
- else :
+ else:
m = liblinear.train(prob, param)
m = toPyModel(m)
"""
predict(y, x, m [, options]) -> (p_labels, p_acc, p_vals)
+ y: a list/tuple/ndarray of l true labels (type must be int/double).
+ It is used for calculating the accuracy. Use [] if true labels are
+ unavailable.
+
+ x: 1. a list/tuple of l predicting instances. Feature vector of
+ each predicting instance is a list/tuple or dictionary.
+
+ 2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
+
Predict data (y, x) with the SVM model m.
options:
-b probability_estimates: whether to output probability estimates, 0 or 1 (default 0); currently for logistic regression only
def info(s):
print(s)
+ if scipy and isinstance(x, scipy.ndarray):
+ x = scipy.ascontiguousarray(x) # enforce row-major
+ elif sparse and isinstance(x, sparse.spmatrix):
+ x = x.tocsr()
+ elif not isinstance(x, (list, tuple)):
+ raise TypeError("type of x: {0} is not supported!".format(type(x)))
+
+ if (not isinstance(y, (list, tuple))) and (not (scipy and isinstance(y, scipy.ndarray))):
+ raise TypeError("type of y: {0} is not supported!".format(type(y)))
+
predict_probability = 0
argv = options.split()
i = 0
pred_labels = []
pred_values = []
+ if scipy and isinstance(x, sparse.spmatrix):
+ nr_instance = x.shape[0]
+ else:
+ nr_instance = len(x)
+
if predict_probability:
if not is_prob_model:
raise TypeError('probability output is only supported for logistic regression')
prob_estimates = (c_double * nr_class)()
- for xi in x:
- xi, idx = gen_feature_nodearray(xi, feature_max=nr_feature)
+ for i in range(nr_instance):
+ if scipy and isinstance(x, sparse.spmatrix):
+ indslice = slice(x.indptr[i], x.indptr[i+1])
+ xi, idx = gen_feature_nodearray((x.indices[indslice], x.data[indslice]), feature_max=nr_feature)
+ else:
+ xi, idx = gen_feature_nodearray(x[i], feature_max=nr_feature)
xi[-2] = biasterm
label = liblinear.predict_probability(m, xi, prob_estimates)
values = prob_estimates[:nr_class]
else:
nr_classifier = nr_class
dec_values = (c_double * nr_classifier)()
- for xi in x:
- xi, idx = gen_feature_nodearray(xi, feature_max=nr_feature)
+ for i in range(nr_instance):
+ if scipy and isinstance(x, sparse.spmatrix):
+ indslice = slice(x.indptr[i], x.indptr[i+1])
+ xi, idx = gen_feature_nodearray((x.indices[indslice], x.data[indslice]), feature_max=nr_feature)
+ else:
+ xi, idx = gen_feature_nodearray(x[i], feature_max=nr_feature)
xi[-2] = biasterm
label = liblinear.predict_values(m, xi, dec_values)
values = dec_values[:nr_classifier]
pred_labels += [label]
pred_values += [values]
+
if len(y) == 0:
- y = [0] * len(x)
+ y = [0] * nr_instance
ACC, MSE, SCC = evaluations(y, pred_labels)
- l = len(y)
+
if m.is_regression_model():
info("Mean squared error = %g (regression)" % MSE)
info("Squared correlation coefficient = %g (regression)" % SCC)
else:
- info("Accuracy = %g%% (%d/%d) (classification)" % (ACC, int(l*ACC/100), l))
+ info("Accuracy = %g%% (%d/%d) (classification)" % (ACC, int(round(nr_instance*ACC/100)), nr_instance))
return pred_labels, (ACC, MSE, SCC), pred_values