From: kevin1kevin1k
Date: Wed, 10 Feb 2021 10:05:13 +0000 (+0800)
Subject: Replace tabs with four spaces in Python files
X-Git-Tag: v243~6
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=976b959223a84e3ce56330ac6126e451d9036439;p=liblinear

Replace tabs with four spaces in Python files

Co-authored-by: Yu-Sheng Li
Co-authored-by: Jui-Nan Yen
---
diff --git a/python/liblinear/commonutil.py b/python/liblinear/commonutil.py
index 881f6bb..c261c42 100644
--- a/python/liblinear/commonutil.py
+++ b/python/liblinear/commonutil.py
@@ -5,174 +5,174 @@ from array import array
 import sys
 try:
-	import scipy
-	from scipy import sparse
+    import scipy
+    from scipy import sparse
 except:
-	scipy = None
-	sparse = None
+    scipy = None
+    sparse = None
 __all__ = ['svm_read_problem', 'evaluations', 'csr_find_scale_param', 'csr_scale']
 def svm_read_problem(data_file_name, return_scipy=False):
-	"""
-	svm_read_problem(data_file_name, return_scipy=False) -> [y, x], y: list, x: list of dictionary
-	svm_read_problem(data_file_name, return_scipy=True) -> [y, x], y: ndarray, x: csr_matrix
-
-	Read LIBSVM-format data from data_file_name and return labels y
-	and data instances x.
-	"""
-	if scipy != None and return_scipy:
-		prob_y = array('d')
-		prob_x = array('d')
-		row_ptr = array('l', [0])
-		col_idx = array('l')
-	else:
-		prob_y = []
-		prob_x = []
-		row_ptr = [0]
-		col_idx = []
-	indx_start = 1
-	for i, line in enumerate(open(data_file_name)):
-		line = line.split(None, 1)
-		# In case an instance with all zero features
-		if len(line) == 1: line += ['']
-		label, features = line
-		prob_y.append(float(label))
-		if scipy != None and return_scipy:
-			nz = 0
-			for e in features.split():
-				ind, val = e.split(":")
-				if ind == '0':
-					indx_start = 0
-				val = float(val)
-				if val != 0:
-					col_idx.append(int(ind)-indx_start)
-					prob_x.append(val)
-					nz += 1
-			row_ptr.append(row_ptr[-1]+nz)
-		else:
-			xi = {}
-			for e in features.split():
-				ind, val = e.split(":")
-				xi[int(ind)] = float(val)
-			prob_x += [xi]
-	if scipy != None and return_scipy:
-		prob_y = scipy.frombuffer(prob_y, dtype='d')
-		prob_x = scipy.frombuffer(prob_x, dtype='d')
-		col_idx = scipy.frombuffer(col_idx, dtype='l')
-		row_ptr = scipy.frombuffer(row_ptr, dtype='l')
-		prob_x = sparse.csr_matrix((prob_x, col_idx, row_ptr))
-	return (prob_y, prob_x)
+    """
+    svm_read_problem(data_file_name, return_scipy=False) -> [y, x], y: list, x: list of dictionary
+    svm_read_problem(data_file_name, return_scipy=True) -> [y, x], y: ndarray, x: csr_matrix
+
+    Read LIBSVM-format data from data_file_name and return labels y
+    and data instances x.
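+
+    A minimal usage sketch (the file name is illustrative; any
+    LIBSVM-format file works, and the second call needs scipy):
+
+    >>> y, x = svm_read_problem('heart_scale')  # illustrative path
+    >>> y, x = svm_read_problem('heart_scale', return_scipy=True)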
+ """ + if scipy != None and return_scipy: + prob_y = array('d') + prob_x = array('d') + row_ptr = array('l', [0]) + col_idx = array('l') + else: + prob_y = [] + prob_x = [] + row_ptr = [0] + col_idx = [] + indx_start = 1 + for i, line in enumerate(open(data_file_name)): + line = line.split(None, 1) + # In case an instance with all zero features + if len(line) == 1: line += [''] + label, features = line + prob_y.append(float(label)) + if scipy != None and return_scipy: + nz = 0 + for e in features.split(): + ind, val = e.split(":") + if ind == '0': + indx_start = 0 + val = float(val) + if val != 0: + col_idx.append(int(ind)-indx_start) + prob_x.append(val) + nz += 1 + row_ptr.append(row_ptr[-1]+nz) + else: + xi = {} + for e in features.split(): + ind, val = e.split(":") + xi[int(ind)] = float(val) + prob_x += [xi] + if scipy != None and return_scipy: + prob_y = scipy.frombuffer(prob_y, dtype='d') + prob_x = scipy.frombuffer(prob_x, dtype='d') + col_idx = scipy.frombuffer(col_idx, dtype='l') + row_ptr = scipy.frombuffer(row_ptr, dtype='l') + prob_x = sparse.csr_matrix((prob_x, col_idx, row_ptr)) + return (prob_y, prob_x) def evaluations_scipy(ty, pv): - """ - evaluations_scipy(ty, pv) -> (ACC, MSE, SCC) - ty, pv: ndarray - - Calculate accuracy, mean squared error and squared correlation coefficient - using the true values (ty) and predicted values (pv). - """ - if not (scipy != None and isinstance(ty, scipy.ndarray) and isinstance(pv, scipy.ndarray)): - raise TypeError("type of ty and pv must be ndarray") - if len(ty) != len(pv): - raise ValueError("len(ty) must be equal to len(pv)") - ACC = 100.0*(ty == pv).mean() - MSE = ((ty - pv)**2).mean() - l = len(ty) - sumv = pv.sum() - sumy = ty.sum() - sumvy = (pv*ty).sum() - sumvv = (pv*pv).sum() - sumyy = (ty*ty).sum() - with scipy.errstate(all = 'raise'): - try: - SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy)) - except: - SCC = float('nan') - return (float(ACC), float(MSE), float(SCC)) + """ + evaluations_scipy(ty, pv) -> (ACC, MSE, SCC) + ty, pv: ndarray + + Calculate accuracy, mean squared error and squared correlation coefficient + using the true values (ty) and predicted values (pv). + """ + if not (scipy != None and isinstance(ty, scipy.ndarray) and isinstance(pv, scipy.ndarray)): + raise TypeError("type of ty and pv must be ndarray") + if len(ty) != len(pv): + raise ValueError("len(ty) must be equal to len(pv)") + ACC = 100.0*(ty == pv).mean() + MSE = ((ty - pv)**2).mean() + l = len(ty) + sumv = pv.sum() + sumy = ty.sum() + sumvy = (pv*ty).sum() + sumvv = (pv*pv).sum() + sumyy = (ty*ty).sum() + with scipy.errstate(all = 'raise'): + try: + SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy)) + except: + SCC = float('nan') + return (float(ACC), float(MSE), float(SCC)) def evaluations(ty, pv, useScipy = True): - """ - evaluations(ty, pv, useScipy) -> (ACC, MSE, SCC) - ty, pv: list, tuple or ndarray - useScipy: convert ty, pv to ndarray, and use scipy functions for the evaluation - - Calculate accuracy, mean squared error and squared correlation coefficient - using the true values (ty) and predicted values (pv). 
- """ - if scipy != None and useScipy: - return evaluations_scipy(scipy.asarray(ty), scipy.asarray(pv)) - if len(ty) != len(pv): - raise ValueError("len(ty) must be equal to len(pv)") - total_correct = total_error = 0 - sumv = sumy = sumvv = sumyy = sumvy = 0 - for v, y in zip(pv, ty): - if y == v: - total_correct += 1 - total_error += (v-y)*(v-y) - sumv += v - sumy += y - sumvv += v*v - sumyy += y*y - sumvy += v*y - l = len(ty) - ACC = 100.0*total_correct/l - MSE = total_error/l - try: - SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy)) - except: - SCC = float('nan') - return (float(ACC), float(MSE), float(SCC)) + """ + evaluations(ty, pv, useScipy) -> (ACC, MSE, SCC) + ty, pv: list, tuple or ndarray + useScipy: convert ty, pv to ndarray, and use scipy functions for the evaluation + + Calculate accuracy, mean squared error and squared correlation coefficient + using the true values (ty) and predicted values (pv). + """ + if scipy != None and useScipy: + return evaluations_scipy(scipy.asarray(ty), scipy.asarray(pv)) + if len(ty) != len(pv): + raise ValueError("len(ty) must be equal to len(pv)") + total_correct = total_error = 0 + sumv = sumy = sumvv = sumyy = sumvy = 0 + for v, y in zip(pv, ty): + if y == v: + total_correct += 1 + total_error += (v-y)*(v-y) + sumv += v + sumy += y + sumvv += v*v + sumyy += y*y + sumvy += v*y + l = len(ty) + ACC = 100.0*total_correct/l + MSE = total_error/l + try: + SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy)) + except: + SCC = float('nan') + return (float(ACC), float(MSE), float(SCC)) def csr_find_scale_param(x, lower=-1, upper=1): - assert isinstance(x, sparse.csr_matrix) - assert lower < upper - l, n = x.shape - feat_min = x.min(axis=0).toarray().flatten() - feat_max = x.max(axis=0).toarray().flatten() - coef = (feat_max - feat_min) / (upper - lower) - coef[coef != 0] = 1.0 / coef[coef != 0] - - # (x - ones(l,1) * feat_min') * diag(coef) + lower - # = x * diag(coef) - ones(l, 1) * (feat_min' * diag(coef)) + lower - # = x * diag(coef) + ones(l, 1) * (-feat_min' * diag(coef) + lower) - # = x * diag(coef) + ones(l, 1) * offset' - offset = -feat_min * coef + lower - offset[coef == 0] = 0 - - if sum(offset != 0) * l > 3 * x.getnnz(): - print( - "WARNING: The #nonzeros of the scaled data is at least 2 times larger than the original one.\n" - "If feature values are non-negative and sparse, set lower=0 rather than the default lower=-1.", - file=sys.stderr) - - return {'coef':coef, 'offset':offset} + assert isinstance(x, sparse.csr_matrix) + assert lower < upper + l, n = x.shape + feat_min = x.min(axis=0).toarray().flatten() + feat_max = x.max(axis=0).toarray().flatten() + coef = (feat_max - feat_min) / (upper - lower) + coef[coef != 0] = 1.0 / coef[coef != 0] + + # (x - ones(l,1) * feat_min') * diag(coef) + lower + # = x * diag(coef) - ones(l, 1) * (feat_min' * diag(coef)) + lower + # = x * diag(coef) + ones(l, 1) * (-feat_min' * diag(coef) + lower) + # = x * diag(coef) + ones(l, 1) * offset' + offset = -feat_min * coef + lower + offset[coef == 0] = 0 + + if sum(offset != 0) * l > 3 * x.getnnz(): + print( + "WARNING: The #nonzeros of the scaled data is at least 2 times larger than the original one.\n" + "If feature values are non-negative and sparse, set lower=0 rather than the default lower=-1.", + file=sys.stderr) + + return {'coef':coef, 'offset':offset} def csr_scale(x, scale_param): - assert isinstance(x, sparse.csr_matrix) + assert isinstance(x, sparse.csr_matrix) - offset 
= scale_param['offset'] - coef = scale_param['coef'] - assert len(coef) == len(offset) + offset = scale_param['offset'] + coef = scale_param['coef'] + assert len(coef) == len(offset) - l, n = x.shape + l, n = x.shape - if not n == len(coef): - print("WARNING: The dimension of scaling parameters and feature number do not match.", file=sys.stderr) - coef = resize(coef, n) - offset = resize(offset, n) + if not n == len(coef): + print("WARNING: The dimension of scaling parameters and feature number do not match.", file=sys.stderr) + coef = resize(coef, n) + offset = resize(offset, n) - # scaled_x = x * diag(coef) + ones(l, 1) * offset' - offset = sparse.csr_matrix(offset.reshape(1, n)) - offset = sparse.vstack([offset] * l, format='csr', dtype=x.dtype) - scaled_x = x.dot(sparse.diags(coef, 0, shape=(n, n))) + offset + # scaled_x = x * diag(coef) + ones(l, 1) * offset' + offset = sparse.csr_matrix(offset.reshape(1, n)) + offset = sparse.vstack([offset] * l, format='csr', dtype=x.dtype) + scaled_x = x.dot(sparse.diags(coef, 0, shape=(n, n))) + offset - if scaled_x.getnnz() > x.getnnz(): - print( - "WARNING: original #nonzeros %d\n" % x.getnnz() + - " > new #nonzeros %d\n" % scaled_x.getnnz() + - "If feature values are non-negative and sparse, get scale_param by setting lower=0 rather than the default lower=-1.", - file=sys.stderr) + if scaled_x.getnnz() > x.getnnz(): + print( + "WARNING: original #nonzeros %d\n" % x.getnnz() + + " > new #nonzeros %d\n" % scaled_x.getnnz() + + "If feature values are non-negative and sparse, get scale_param by setting lower=0 rather than the default lower=-1.", + file=sys.stderr) - return scaled_x + return scaled_x diff --git a/python/liblinear/liblinear.py b/python/liblinear/liblinear.py index c223129..31b3252 100644 --- a/python/liblinear/liblinear.py +++ b/python/liblinear/liblinear.py @@ -7,15 +7,15 @@ from glob import glob import sys try: - import scipy - from scipy import sparse + import scipy + from scipy import sparse except: - scipy = None - sparse = None + scipy = None + sparse = None if sys.version_info[0] < 3: - range = xrange - from itertools import izip as zip + range = xrange + from itertools import izip as zip __all__ = ['liblinear', 'feature_node', 'gen_feature_nodearray', 'problem', 'parameter', 'model', 'toPyModel', 'L2R_LR', 'L2R_L2LOSS_SVC_DUAL', @@ -25,24 +25,24 @@ __all__ = ['liblinear', 'feature_node', 'gen_feature_nodearray', 'problem', 'print_null'] try: - dirname = path.dirname(path.abspath(__file__)) - dynamic_lib_name = 'clib.cp*' - path_to_so = glob(path.join(dirname, dynamic_lib_name))[0] - liblinear = CDLL(path_to_so) + dirname = path.dirname(path.abspath(__file__)) + dynamic_lib_name = 'clib.cp*' + path_to_so = glob(path.join(dirname, dynamic_lib_name))[0] + liblinear = CDLL(path_to_so) except: - try : - if sys.platform == 'win32': - liblinear = CDLL(path.join(dirname, r'..\..\windows\liblinear.dll')) - else: - liblinear = CDLL(path.join(dirname, '../../liblinear.so.4')) - except: - # For unix the prefix 'lib' is not considered. - if find_library('linear'): - liblinear = CDLL(find_library('linear')) - elif find_library('liblinear'): - liblinear = CDLL(find_library('liblinear')) - else: - raise Exception('LIBLINEAR library not found.') + try : + if sys.platform == 'win32': + liblinear = CDLL(path.join(dirname, r'..\..\windows\liblinear.dll')) + else: + liblinear = CDLL(path.join(dirname, '../../liblinear.so.4')) + except: + # For unix the prefix 'lib' is not considered. 
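+        # ctypes.util.find_library searches the standard system library
+        # paths; on Linux it would return a soname such as 'liblinear.so.4'
+        # (name illustrative), and None when nothing is found, in which
+        # case the else branch below raises.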
+ if find_library('linear'): + liblinear = CDLL(find_library('linear')) + elif find_library('liblinear'): + liblinear = CDLL(find_library('liblinear')) + else: + raise Exception('LIBLINEAR library not found.') L2R_LR = 0 L2R_L2LOSS_SVC_DUAL = 1 @@ -59,370 +59,370 @@ ONECLASS_SVM = 21 PRINT_STRING_FUN = CFUNCTYPE(None, c_char_p) def print_null(s): - return + return def genFields(names, types): - return list(zip(names, types)) + return list(zip(names, types)) def fillprototype(f, restype, argtypes): - f.restype = restype - f.argtypes = argtypes + f.restype = restype + f.argtypes = argtypes class feature_node(Structure): - _names = ["index", "value"] - _types = [c_int, c_double] - _fields_ = genFields(_names, _types) + _names = ["index", "value"] + _types = [c_int, c_double] + _fields_ = genFields(_names, _types) - def __str__(self): - return '%d:%g' % (self.index, self.value) + def __str__(self): + return '%d:%g' % (self.index, self.value) def gen_feature_nodearray(xi, feature_max=None): - if feature_max: - assert(isinstance(feature_max, int)) - - xi_shift = 0 # ensure correct indices of xi - if scipy and isinstance(xi, tuple) and len(xi) == 2\ - and isinstance(xi[0], scipy.ndarray) and isinstance(xi[1], scipy.ndarray): # for a sparse vector - index_range = xi[0] + 1 # index starts from 1 - if feature_max: - index_range = index_range[scipy.where(index_range <= feature_max)] - elif scipy and isinstance(xi, scipy.ndarray): - xi_shift = 1 - index_range = xi.nonzero()[0] + 1 # index starts from 1 - if feature_max: - index_range = index_range[scipy.where(index_range <= feature_max)] - elif isinstance(xi, (dict, list, tuple)): - if isinstance(xi, dict): - index_range = xi.keys() - elif isinstance(xi, (list, tuple)): - xi_shift = 1 - index_range = range(1, len(xi) + 1) - index_range = filter(lambda j: xi[j-xi_shift] != 0, index_range) - - if feature_max: - index_range = filter(lambda j: j <= feature_max, index_range) - index_range = sorted(index_range) - else: - raise TypeError('xi should be a dictionary, list, tuple, 1-d numpy array, or tuple of (index, data)') - - ret = (feature_node*(len(index_range)+2))() - ret[-1].index = -1 # for bias term - ret[-2].index = -1 - - if scipy and isinstance(xi, tuple) and len(xi) == 2\ - and isinstance(xi[0], scipy.ndarray) and isinstance(xi[1], scipy.ndarray): # for a sparse vector - for idx, j in enumerate(index_range): - ret[idx].index = j - ret[idx].value = (xi[1])[idx] - else: - for idx, j in enumerate(index_range): - ret[idx].index = j - ret[idx].value = xi[j - xi_shift] - - max_idx = 0 - if len(index_range) > 0: - max_idx = index_range[-1] - return ret, max_idx + if feature_max: + assert(isinstance(feature_max, int)) + + xi_shift = 0 # ensure correct indices of xi + if scipy and isinstance(xi, tuple) and len(xi) == 2\ + and isinstance(xi[0], scipy.ndarray) and isinstance(xi[1], scipy.ndarray): # for a sparse vector + index_range = xi[0] + 1 # index starts from 1 + if feature_max: + index_range = index_range[scipy.where(index_range <= feature_max)] + elif scipy and isinstance(xi, scipy.ndarray): + xi_shift = 1 + index_range = xi.nonzero()[0] + 1 # index starts from 1 + if feature_max: + index_range = index_range[scipy.where(index_range <= feature_max)] + elif isinstance(xi, (dict, list, tuple)): + if isinstance(xi, dict): + index_range = xi.keys() + elif isinstance(xi, (list, tuple)): + xi_shift = 1 + index_range = range(1, len(xi) + 1) + index_range = filter(lambda j: xi[j-xi_shift] != 0, index_range) + + if feature_max: + index_range = filter(lambda j: j 
<= feature_max, index_range) + index_range = sorted(index_range) + else: + raise TypeError('xi should be a dictionary, list, tuple, 1-d numpy array, or tuple of (index, data)') + + ret = (feature_node*(len(index_range)+2))() + ret[-1].index = -1 # for bias term + ret[-2].index = -1 + + if scipy and isinstance(xi, tuple) and len(xi) == 2\ + and isinstance(xi[0], scipy.ndarray) and isinstance(xi[1], scipy.ndarray): # for a sparse vector + for idx, j in enumerate(index_range): + ret[idx].index = j + ret[idx].value = (xi[1])[idx] + else: + for idx, j in enumerate(index_range): + ret[idx].index = j + ret[idx].value = xi[j - xi_shift] + + max_idx = 0 + if len(index_range) > 0: + max_idx = index_range[-1] + return ret, max_idx try: - from numba import jit - jit_enabled = True + from numba import jit + jit_enabled = True except: - jit = lambda x: x - jit_enabled = False + jit = lambda x: x + jit_enabled = False @jit def csr_to_problem_jit(l, x_val, x_ind, x_rowptr, prob_val, prob_ind, prob_rowptr): - for i in range(l): - b1,e1 = x_rowptr[i], x_rowptr[i+1] - b2,e2 = prob_rowptr[i], prob_rowptr[i+1]-2 - for j in range(b1,e1): - prob_ind[j-b1+b2] = x_ind[j]+1 - prob_val[j-b1+b2] = x_val[j] + for i in range(l): + b1,e1 = x_rowptr[i], x_rowptr[i+1] + b2,e2 = prob_rowptr[i], prob_rowptr[i+1]-2 + for j in range(b1,e1): + prob_ind[j-b1+b2] = x_ind[j]+1 + prob_val[j-b1+b2] = x_val[j] def csr_to_problem_nojit(l, x_val, x_ind, x_rowptr, prob_val, prob_ind, prob_rowptr): - for i in range(l): - x_slice = slice(x_rowptr[i], x_rowptr[i+1]) - prob_slice = slice(prob_rowptr[i], prob_rowptr[i+1]-2) - prob_ind[prob_slice] = x_ind[x_slice]+1 - prob_val[prob_slice] = x_val[x_slice] + for i in range(l): + x_slice = slice(x_rowptr[i], x_rowptr[i+1]) + prob_slice = slice(prob_rowptr[i], prob_rowptr[i+1]-2) + prob_ind[prob_slice] = x_ind[x_slice]+1 + prob_val[prob_slice] = x_val[x_slice] def csr_to_problem(x, prob): - # Extra space for termination node and (possibly) bias term - x_space = prob.x_space = scipy.empty((x.nnz+x.shape[0]*2), dtype=feature_node) - prob.rowptr = x.indptr.copy() - prob.rowptr[1:] += 2*scipy.arange(1,x.shape[0]+1) - prob_ind = x_space["index"] - prob_val = x_space["value"] - prob_ind[:] = -1 - if jit_enabled: - csr_to_problem_jit(x.shape[0], x.data, x.indices, x.indptr, prob_val, prob_ind, prob.rowptr) - else: - csr_to_problem_nojit(x.shape[0], x.data, x.indices, x.indptr, prob_val, prob_ind, prob.rowptr) + # Extra space for termination node and (possibly) bias term + x_space = prob.x_space = scipy.empty((x.nnz+x.shape[0]*2), dtype=feature_node) + prob.rowptr = x.indptr.copy() + prob.rowptr[1:] += 2*scipy.arange(1,x.shape[0]+1) + prob_ind = x_space["index"] + prob_val = x_space["value"] + prob_ind[:] = -1 + if jit_enabled: + csr_to_problem_jit(x.shape[0], x.data, x.indices, x.indptr, prob_val, prob_ind, prob.rowptr) + else: + csr_to_problem_nojit(x.shape[0], x.data, x.indices, x.indptr, prob_val, prob_ind, prob.rowptr) class problem(Structure): - _names = ["l", "n", "y", "x", "bias"] - _types = [c_int, c_int, POINTER(c_double), POINTER(POINTER(feature_node)), c_double] - _fields_ = genFields(_names, _types) - - def __init__(self, y, x, bias = -1): - if (not isinstance(y, (list, tuple))) and (not (scipy and isinstance(y, scipy.ndarray))): - raise TypeError("type of y: {0} is not supported!".format(type(y))) - - if isinstance(x, (list, tuple)): - if len(y) != len(x): - raise ValueError("len(y) != len(x)") - elif scipy != None and isinstance(x, (scipy.ndarray, sparse.spmatrix)): - if len(y) != 
x.shape[0]: - raise ValueError("len(y) != len(x)") - if isinstance(x, scipy.ndarray): - x = scipy.ascontiguousarray(x) # enforce row-major - if isinstance(x, sparse.spmatrix): - x = x.tocsr() - pass - else: - raise TypeError("type of x: {0} is not supported!".format(type(x))) - self.l = l = len(y) - self.bias = -1 - - max_idx = 0 - x_space = self.x_space = [] - if scipy != None and isinstance(x, sparse.csr_matrix): - csr_to_problem(x, self) - max_idx = x.shape[1] - else: - for i, xi in enumerate(x): - tmp_xi, tmp_idx = gen_feature_nodearray(xi) - x_space += [tmp_xi] - max_idx = max(max_idx, tmp_idx) - self.n = max_idx - - self.y = (c_double * l)() - if scipy != None and isinstance(y, scipy.ndarray): - scipy.ctypeslib.as_array(self.y, (self.l,))[:] = y - else: - for i, yi in enumerate(y): self.y[i] = yi - - self.x = (POINTER(feature_node) * l)() - if scipy != None and isinstance(x, sparse.csr_matrix): - base = addressof(self.x_space.ctypes.data_as(POINTER(feature_node))[0]) - x_ptr = cast(self.x, POINTER(c_uint64)) - x_ptr = scipy.ctypeslib.as_array(x_ptr,(self.l,)) - x_ptr[:] = self.rowptr[:-1]*sizeof(feature_node)+base - else: - for i, xi in enumerate(self.x_space): self.x[i] = xi - - self.set_bias(bias) - - def set_bias(self, bias): - if self.bias == bias: - return - if bias >= 0 and self.bias < 0: - self.n += 1 - node = feature_node(self.n, bias) - if bias < 0 and self.bias >= 0: - self.n -= 1 - node = feature_node(-1, bias) - - if isinstance(self.x_space, list): - for xi in self.x_space: - xi[-2] = node - else: - self.x_space["index"][self.rowptr[1:]-2] = node.index - self.x_space["value"][self.rowptr[1:]-2] = node.value - - self.bias = bias + _names = ["l", "n", "y", "x", "bias"] + _types = [c_int, c_int, POINTER(c_double), POINTER(POINTER(feature_node)), c_double] + _fields_ = genFields(_names, _types) + + def __init__(self, y, x, bias = -1): + if (not isinstance(y, (list, tuple))) and (not (scipy and isinstance(y, scipy.ndarray))): + raise TypeError("type of y: {0} is not supported!".format(type(y))) + + if isinstance(x, (list, tuple)): + if len(y) != len(x): + raise ValueError("len(y) != len(x)") + elif scipy != None and isinstance(x, (scipy.ndarray, sparse.spmatrix)): + if len(y) != x.shape[0]: + raise ValueError("len(y) != len(x)") + if isinstance(x, scipy.ndarray): + x = scipy.ascontiguousarray(x) # enforce row-major + if isinstance(x, sparse.spmatrix): + x = x.tocsr() + pass + else: + raise TypeError("type of x: {0} is not supported!".format(type(x))) + self.l = l = len(y) + self.bias = -1 + + max_idx = 0 + x_space = self.x_space = [] + if scipy != None and isinstance(x, sparse.csr_matrix): + csr_to_problem(x, self) + max_idx = x.shape[1] + else: + for i, xi in enumerate(x): + tmp_xi, tmp_idx = gen_feature_nodearray(xi) + x_space += [tmp_xi] + max_idx = max(max_idx, tmp_idx) + self.n = max_idx + + self.y = (c_double * l)() + if scipy != None and isinstance(y, scipy.ndarray): + scipy.ctypeslib.as_array(self.y, (self.l,))[:] = y + else: + for i, yi in enumerate(y): self.y[i] = yi + + self.x = (POINTER(feature_node) * l)() + if scipy != None and isinstance(x, sparse.csr_matrix): + base = addressof(self.x_space.ctypes.data_as(POINTER(feature_node))[0]) + x_ptr = cast(self.x, POINTER(c_uint64)) + x_ptr = scipy.ctypeslib.as_array(x_ptr,(self.l,)) + x_ptr[:] = self.rowptr[:-1]*sizeof(feature_node)+base + else: + for i, xi in enumerate(self.x_space): self.x[i] = xi + + self.set_bias(bias) + + def set_bias(self, bias): + if self.bias == bias: + return + if bias >= 0 and self.bias < 0: + 
self.n += 1 + node = feature_node(self.n, bias) + if bias < 0 and self.bias >= 0: + self.n -= 1 + node = feature_node(-1, bias) + + if isinstance(self.x_space, list): + for xi in self.x_space: + xi[-2] = node + else: + self.x_space["index"][self.rowptr[1:]-2] = node.index + self.x_space["value"][self.rowptr[1:]-2] = node.value + + self.bias = bias class parameter(Structure): - _names = ["solver_type", "eps", "C", "nr_weight", "weight_label", "weight", "p", "nu", "init_sol", "regularize_bias"] - _types = [c_int, c_double, c_double, c_int, POINTER(c_int), POINTER(c_double), c_double, c_double, POINTER(c_double), c_int] - _fields_ = genFields(_names, _types) - - def __init__(self, options = None): - if options == None: - options = '' - self.parse_options(options) - - def __str__(self): - s = '' - attrs = parameter._names + list(self.__dict__.keys()) - values = map(lambda attr: getattr(self, attr), attrs) - for attr, val in zip(attrs, values): - s += (' %s: %s\n' % (attr, val)) - s = s.strip() - - return s - - def set_to_default_values(self): - self.solver_type = L2R_L2LOSS_SVC_DUAL - self.eps = float('inf') - self.C = 1 - self.p = 0.1 - self.nu = 0.5 - self.nr_weight = 0 - self.weight_label = None - self.weight = None - self.init_sol = None - self.bias = -1 - self.regularize_bias = 1 - self.flag_cross_validation = False - self.flag_C_specified = False - self.flag_p_specified = False - self.flag_solver_specified = False - self.flag_find_parameters = False - self.nr_fold = 0 - self.print_func = cast(None, PRINT_STRING_FUN) - - def parse_options(self, options): - if isinstance(options, list): - argv = options - elif isinstance(options, str): - argv = options.split() - else: - raise TypeError("arg 1 should be a list or a str.") - self.set_to_default_values() - self.print_func = cast(None, PRINT_STRING_FUN) - weight_label = [] - weight = [] - - i = 0 - while i < len(argv) : - if argv[i] == "-s": - i = i + 1 - self.solver_type = int(argv[i]) - self.flag_solver_specified = True - elif argv[i] == "-c": - i = i + 1 - self.C = float(argv[i]) - self.flag_C_specified = True - elif argv[i] == "-p": - i = i + 1 - self.p = float(argv[i]) - self.flag_p_specified = True - elif argv[i] == "-n": - i = i + 1 - self.nu = float(argv[i]) - elif argv[i] == "-e": - i = i + 1 - self.eps = float(argv[i]) - elif argv[i] == "-B": - i = i + 1 - self.bias = float(argv[i]) - elif argv[i] == "-v": - i = i + 1 - self.flag_cross_validation = 1 - self.nr_fold = int(argv[i]) - if self.nr_fold < 2 : - raise ValueError("n-fold cross validation: n must >= 2") - elif argv[i].startswith("-w"): - i = i + 1 - self.nr_weight += 1 - weight_label += [int(argv[i-1][2:])] - weight += [float(argv[i])] - elif argv[i] == "-q": - self.print_func = PRINT_STRING_FUN(print_null) - elif argv[i] == "-C": - self.flag_find_parameters = True - elif argv[i] == "-R": - self.regularize_bias = 0 - else: - raise ValueError("Wrong options") - i += 1 - - liblinear.set_print_string_function(self.print_func) - self.weight_label = (c_int*self.nr_weight)() - self.weight = (c_double*self.nr_weight)() - for i in range(self.nr_weight): - self.weight[i] = weight[i] - self.weight_label[i] = weight_label[i] - - # default solver for parameter selection is L2R_L2LOSS_SVC - if self.flag_find_parameters: - if not self.flag_cross_validation: - self.nr_fold = 5 - if not self.flag_solver_specified: - self.solver_type = L2R_L2LOSS_SVC - self.flag_solver_specified = True - elif self.solver_type not in [L2R_LR, L2R_L2LOSS_SVC, L2R_L2LOSS_SVR]: - raise ValueError("Warm-start 
parameter search only available for -s 0, -s 2 and -s 11") - - if self.eps == float('inf'): - if self.solver_type in [L2R_LR, L2R_L2LOSS_SVC]: - self.eps = 0.01 - elif self.solver_type in [L2R_L2LOSS_SVR]: - self.eps = 0.0001 - elif self.solver_type in [L2R_L2LOSS_SVC_DUAL, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L2R_LR_DUAL]: - self.eps = 0.1 - elif self.solver_type in [L1R_L2LOSS_SVC, L1R_LR]: - self.eps = 0.01 - elif self.solver_type in [L2R_L2LOSS_SVR_DUAL, L2R_L1LOSS_SVR_DUAL]: - self.eps = 0.1 - elif self.solver_type in [ONECLASS_SVM]: - self.eps = 0.01 + _names = ["solver_type", "eps", "C", "nr_weight", "weight_label", "weight", "p", "nu", "init_sol", "regularize_bias"] + _types = [c_int, c_double, c_double, c_int, POINTER(c_int), POINTER(c_double), c_double, c_double, POINTER(c_double), c_int] + _fields_ = genFields(_names, _types) + + def __init__(self, options = None): + if options == None: + options = '' + self.parse_options(options) + + def __str__(self): + s = '' + attrs = parameter._names + list(self.__dict__.keys()) + values = map(lambda attr: getattr(self, attr), attrs) + for attr, val in zip(attrs, values): + s += (' %s: %s\n' % (attr, val)) + s = s.strip() + + return s + + def set_to_default_values(self): + self.solver_type = L2R_L2LOSS_SVC_DUAL + self.eps = float('inf') + self.C = 1 + self.p = 0.1 + self.nu = 0.5 + self.nr_weight = 0 + self.weight_label = None + self.weight = None + self.init_sol = None + self.bias = -1 + self.regularize_bias = 1 + self.flag_cross_validation = False + self.flag_C_specified = False + self.flag_p_specified = False + self.flag_solver_specified = False + self.flag_find_parameters = False + self.nr_fold = 0 + self.print_func = cast(None, PRINT_STRING_FUN) + + def parse_options(self, options): + if isinstance(options, list): + argv = options + elif isinstance(options, str): + argv = options.split() + else: + raise TypeError("arg 1 should be a list or a str.") + self.set_to_default_values() + self.print_func = cast(None, PRINT_STRING_FUN) + weight_label = [] + weight = [] + + i = 0 + while i < len(argv) : + if argv[i] == "-s": + i = i + 1 + self.solver_type = int(argv[i]) + self.flag_solver_specified = True + elif argv[i] == "-c": + i = i + 1 + self.C = float(argv[i]) + self.flag_C_specified = True + elif argv[i] == "-p": + i = i + 1 + self.p = float(argv[i]) + self.flag_p_specified = True + elif argv[i] == "-n": + i = i + 1 + self.nu = float(argv[i]) + elif argv[i] == "-e": + i = i + 1 + self.eps = float(argv[i]) + elif argv[i] == "-B": + i = i + 1 + self.bias = float(argv[i]) + elif argv[i] == "-v": + i = i + 1 + self.flag_cross_validation = 1 + self.nr_fold = int(argv[i]) + if self.nr_fold < 2 : + raise ValueError("n-fold cross validation: n must >= 2") + elif argv[i].startswith("-w"): + i = i + 1 + self.nr_weight += 1 + weight_label += [int(argv[i-1][2:])] + weight += [float(argv[i])] + elif argv[i] == "-q": + self.print_func = PRINT_STRING_FUN(print_null) + elif argv[i] == "-C": + self.flag_find_parameters = True + elif argv[i] == "-R": + self.regularize_bias = 0 + else: + raise ValueError("Wrong options") + i += 1 + + liblinear.set_print_string_function(self.print_func) + self.weight_label = (c_int*self.nr_weight)() + self.weight = (c_double*self.nr_weight)() + for i in range(self.nr_weight): + self.weight[i] = weight[i] + self.weight_label[i] = weight_label[i] + + # default solver for parameter selection is L2R_L2LOSS_SVC + if self.flag_find_parameters: + if not self.flag_cross_validation: + self.nr_fold = 5 + if not self.flag_solver_specified: + 
self.solver_type = L2R_L2LOSS_SVC + self.flag_solver_specified = True + elif self.solver_type not in [L2R_LR, L2R_L2LOSS_SVC, L2R_L2LOSS_SVR]: + raise ValueError("Warm-start parameter search only available for -s 0, -s 2 and -s 11") + + if self.eps == float('inf'): + if self.solver_type in [L2R_LR, L2R_L2LOSS_SVC]: + self.eps = 0.01 + elif self.solver_type in [L2R_L2LOSS_SVR]: + self.eps = 0.0001 + elif self.solver_type in [L2R_L2LOSS_SVC_DUAL, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L2R_LR_DUAL]: + self.eps = 0.1 + elif self.solver_type in [L1R_L2LOSS_SVC, L1R_LR]: + self.eps = 0.01 + elif self.solver_type in [L2R_L2LOSS_SVR_DUAL, L2R_L1LOSS_SVR_DUAL]: + self.eps = 0.1 + elif self.solver_type in [ONECLASS_SVM]: + self.eps = 0.01 class model(Structure): - _names = ["param", "nr_class", "nr_feature", "w", "label", "bias", "rho"] - _types = [parameter, c_int, c_int, POINTER(c_double), POINTER(c_int), c_double, c_double] - _fields_ = genFields(_names, _types) + _names = ["param", "nr_class", "nr_feature", "w", "label", "bias", "rho"] + _types = [parameter, c_int, c_int, POINTER(c_double), POINTER(c_int), c_double, c_double] + _fields_ = genFields(_names, _types) - def __init__(self): - self.__createfrom__ = 'python' + def __init__(self): + self.__createfrom__ = 'python' - def __del__(self): - # free memory created by C to avoid memory leak - if hasattr(self, '__createfrom__') and self.__createfrom__ == 'C': - liblinear.free_and_destroy_model(pointer(self)) + def __del__(self): + # free memory created by C to avoid memory leak + if hasattr(self, '__createfrom__') and self.__createfrom__ == 'C': + liblinear.free_and_destroy_model(pointer(self)) - def get_nr_feature(self): - return liblinear.get_nr_feature(self) + def get_nr_feature(self): + return liblinear.get_nr_feature(self) - def get_nr_class(self): - return liblinear.get_nr_class(self) + def get_nr_class(self): + return liblinear.get_nr_class(self) - def get_labels(self): - nr_class = self.get_nr_class() - labels = (c_int * nr_class)() - liblinear.get_labels(self, labels) - return labels[:nr_class] + def get_labels(self): + nr_class = self.get_nr_class() + labels = (c_int * nr_class)() + liblinear.get_labels(self, labels) + return labels[:nr_class] - def get_decfun_coef(self, feat_idx, label_idx=0): - return liblinear.get_decfun_coef(self, feat_idx, label_idx) + def get_decfun_coef(self, feat_idx, label_idx=0): + return liblinear.get_decfun_coef(self, feat_idx, label_idx) - def get_decfun_bias(self, label_idx=0): - return liblinear.get_decfun_bias(self, label_idx) + def get_decfun_bias(self, label_idx=0): + return liblinear.get_decfun_bias(self, label_idx) - def get_decfun_rho(self): - return liblinear.get_decfun_rho(self) + def get_decfun_rho(self): + return liblinear.get_decfun_rho(self) - def get_decfun(self, label_idx=0): - w = [liblinear.get_decfun_coef(self, feat_idx, label_idx) for feat_idx in range(1, self.nr_feature+1)] - if self.is_oneclass_model(): - rho = self.get_decfun_rho() - return (w, -rho) - else: - b = liblinear.get_decfun_bias(self, label_idx) - return (w, b) + def get_decfun(self, label_idx=0): + w = [liblinear.get_decfun_coef(self, feat_idx, label_idx) for feat_idx in range(1, self.nr_feature+1)] + if self.is_oneclass_model(): + rho = self.get_decfun_rho() + return (w, -rho) + else: + b = liblinear.get_decfun_bias(self, label_idx) + return (w, b) - def is_probability_model(self): - return (liblinear.check_probability_model(self) == 1) + def is_probability_model(self): + return (liblinear.check_probability_model(self) == 1) - 
def is_regression_model(self): - return (liblinear.check_regression_model(self) == 1) + def is_regression_model(self): + return (liblinear.check_regression_model(self) == 1) - def is_oneclass_model(self): - return (liblinear.check_oneclass_model(self) == 1) + def is_oneclass_model(self): + return (liblinear.check_oneclass_model(self) == 1) def toPyModel(model_ptr): - """ - toPyModel(model_ptr) -> model - - Convert a ctypes POINTER(model) to a Python model - """ - if bool(model_ptr) == False: - raise ValueError("Null pointer") - m = model_ptr.contents - m.__createfrom__ = 'C' - return m + """ + toPyModel(model_ptr) -> model + + Convert a ctypes POINTER(model) to a Python model + """ + if bool(model_ptr) == False: + raise ValueError("Null pointer") + m = model_ptr.contents + m.__createfrom__ = 'C' + return m fillprototype(liblinear.train, POINTER(model), [POINTER(problem), POINTER(parameter)]) fillprototype(liblinear.find_parameters, None, [POINTER(problem), POINTER(parameter), c_int, c_double, c_double, POINTER(c_double), POINTER(c_double), POINTER(c_double)]) diff --git a/python/liblinear/liblinearutil.py b/python/liblinear/liblinearutil.py index 073ca41..d5e99f7 100644 --- a/python/liblinear/liblinearutil.py +++ b/python/liblinear/liblinearutil.py @@ -10,273 +10,273 @@ from .commonutil import __all__ as common_all from ctypes import c_double if sys.version_info[0] < 3: - range = xrange - from itertools import izip as zip - _cstr = lambda s: s.encode("utf-8") if isinstance(s,unicode) else str(s) + range = xrange + from itertools import izip as zip + _cstr = lambda s: s.encode("utf-8") if isinstance(s,unicode) else str(s) else: - _cstr = lambda s: bytes(s, "utf-8") + _cstr = lambda s: bytes(s, "utf-8") __all__ = ['load_model', 'save_model', 'train', 'predict'] + liblinear_all + common_all def load_model(model_file_name): - """ - load_model(model_file_name) -> model - - Load a LIBLINEAR model from model_file_name and return. - """ - model = liblinear.load_model(_cstr(model_file_name)) - if not model: - print("can't open model file %s" % model_file_name) - return None - model = toPyModel(model) - return model + """ + load_model(model_file_name) -> model + + Load a LIBLINEAR model from model_file_name and return. + """ + model = liblinear.load_model(_cstr(model_file_name)) + if not model: + print("can't open model file %s" % model_file_name) + return None + model = toPyModel(model) + return model def save_model(model_file_name, model): - """ - save_model(model_file_name, model) -> None + """ + save_model(model_file_name, model) -> None - Save a LIBLINEAR model to the file model_file_name. - """ - liblinear.save_model(_cstr(model_file_name), model) + Save a LIBLINEAR model to the file model_file_name. + """ + liblinear.save_model(_cstr(model_file_name), model) def train(arg1, arg2=None, arg3=None): - """ - train(y, x [, options]) -> model | ACC - - y: a list/tuple/ndarray of l true labels (type must be int/double). - - x: 1. a list/tuple of l training instances. Feature vector of - each training instance is a list/tuple or dictionary. - - 2. an l * n numpy ndarray or scipy spmatrix (n: number of features). - - train(prob [, options]) -> model | ACC - train(prob, param) -> model | ACC - - Train a model from data (y, x) or a problem prob using - 'options' or a parameter param. - - If '-v' is specified in 'options' (i.e., cross validation) - either accuracy (ACC) or mean-squared error (MSE) is returned. 
- - options: - -s type : set type of solver (default 1) - for multi-class classification - 0 -- L2-regularized logistic regression (primal) - 1 -- L2-regularized L2-loss support vector classification (dual) - 2 -- L2-regularized L2-loss support vector classification (primal) - 3 -- L2-regularized L1-loss support vector classification (dual) - 4 -- support vector classification by Crammer and Singer - 5 -- L1-regularized L2-loss support vector classification - 6 -- L1-regularized logistic regression - 7 -- L2-regularized logistic regression (dual) - for regression - 11 -- L2-regularized L2-loss support vector regression (primal) - 12 -- L2-regularized L2-loss support vector regression (dual) - 13 -- L2-regularized L1-loss support vector regression (dual) - for outlier detection - 21 -- one-class support vector machine (dual) - -c cost : set the parameter C (default 1) - -p epsilon : set the epsilon in loss function of SVR (default 0.1) - -e epsilon : set tolerance of termination criterion - -s 0 and 2 - |f'(w)|_2 <= eps*min(pos,neg)/l*|f'(w0)|_2, - where f is the primal function, (default 0.01) - -s 11 - |f'(w)|_2 <= eps*|f'(w0)|_2 (default 0.0001) - -s 1, 3, 4, 7, and 21 - Dual maximal violation <= eps; similar to libsvm (default 0.1 except 0.01 for -s 21) - -s 5 and 6 - |f'(w)|_inf <= eps*min(pos,neg)/l*|f'(w0)|_inf, - where f is the primal function (default 0.01) - -s 12 and 13 - |f'(alpha)|_1 <= eps |f'(alpha0)|, - where f is the dual function (default 0.1) - -B bias : if bias >= 0, instance x becomes [x; bias]; if < 0, no bias term added (default -1) - -R : not regularize the bias; must with -B 1 to have the bias; DON'T use this unless you know what it is - (for -s 0, 2, 5, 6, 11)" - -wi weight: weights adjust the parameter C of different classes (see README for details) - -v n: n-fold cross validation mode - -C : find parameters (C for -s 0, 2 and C, p for -s 11) - -q : quiet mode (no outputs) - """ - prob, param = None, None - if isinstance(arg1, (list, tuple)) or (scipy and isinstance(arg1, scipy.ndarray)): - assert isinstance(arg2, (list, tuple)) or (scipy and isinstance(arg2, (scipy.ndarray, sparse.spmatrix))) - y, x, options = arg1, arg2, arg3 - prob = problem(y, x) - param = parameter(options) - elif isinstance(arg1, problem): - prob = arg1 - if isinstance(arg2, parameter): - param = arg2 - else: - param = parameter(arg2) - if prob == None or param == None : - raise TypeError("Wrong types for the arguments") - - prob.set_bias(param.bias) - liblinear.set_print_string_function(param.print_func) - err_msg = liblinear.check_parameter(prob, param) - if err_msg : - raise ValueError('Error: %s' % err_msg) - - if param.flag_find_parameters: - nr_fold = param.nr_fold - best_C = c_double() - best_p = c_double() - best_score = c_double() - if param.flag_C_specified: - start_C = param.C - else: - start_C = -1.0 - if param.flag_p_specified: - start_p = param.p - else: - start_p = -1.0 - liblinear.find_parameters(prob, param, nr_fold, start_C, start_p, best_C, best_p, best_score) - if param.solver_type in [L2R_LR, L2R_L2LOSS_SVC]: - print("Best C = %g CV accuracy = %g%%\n"% (best_C.value, 100.0*best_score.value)) - elif param.solver_type in [L2R_L2LOSS_SVR]: - print("Best C = %g Best p = %g CV MSE = %g\n"% (best_C.value, best_p.value, best_score.value)) - return best_C.value,best_p.value,best_score.value - - - elif param.flag_cross_validation: - l, nr_fold = prob.l, param.nr_fold - target = (c_double * l)() - liblinear.cross_validation(prob, param, nr_fold, target) - ACC, MSE, SCC = 
evaluations(prob.y[:l], target[:l]) - if param.solver_type in [L2R_L2LOSS_SVR, L2R_L2LOSS_SVR_DUAL, L2R_L1LOSS_SVR_DUAL]: - print("Cross Validation Mean squared error = %g" % MSE) - print("Cross Validation Squared correlation coefficient = %g" % SCC) - return MSE - else: - print("Cross Validation Accuracy = %g%%" % ACC) - return ACC - else: - m = liblinear.train(prob, param) - m = toPyModel(m) - - return m + """ + train(y, x [, options]) -> model | ACC + + y: a list/tuple/ndarray of l true labels (type must be int/double). + + x: 1. a list/tuple of l training instances. Feature vector of + each training instance is a list/tuple or dictionary. + + 2. an l * n numpy ndarray or scipy spmatrix (n: number of features). + + train(prob [, options]) -> model | ACC + train(prob, param) -> model | ACC + + Train a model from data (y, x) or a problem prob using + 'options' or a parameter param. + + If '-v' is specified in 'options' (i.e., cross validation) + either accuracy (ACC) or mean-squared error (MSE) is returned. + + options: + -s type : set type of solver (default 1) + for multi-class classification + 0 -- L2-regularized logistic regression (primal) + 1 -- L2-regularized L2-loss support vector classification (dual) + 2 -- L2-regularized L2-loss support vector classification (primal) + 3 -- L2-regularized L1-loss support vector classification (dual) + 4 -- support vector classification by Crammer and Singer + 5 -- L1-regularized L2-loss support vector classification + 6 -- L1-regularized logistic regression + 7 -- L2-regularized logistic regression (dual) + for regression + 11 -- L2-regularized L2-loss support vector regression (primal) + 12 -- L2-regularized L2-loss support vector regression (dual) + 13 -- L2-regularized L1-loss support vector regression (dual) + for outlier detection + 21 -- one-class support vector machine (dual) + -c cost : set the parameter C (default 1) + -p epsilon : set the epsilon in loss function of SVR (default 0.1) + -e epsilon : set tolerance of termination criterion + -s 0 and 2 + |f'(w)|_2 <= eps*min(pos,neg)/l*|f'(w0)|_2, + where f is the primal function, (default 0.01) + -s 11 + |f'(w)|_2 <= eps*|f'(w0)|_2 (default 0.0001) + -s 1, 3, 4, 7, and 21 + Dual maximal violation <= eps; similar to libsvm (default 0.1 except 0.01 for -s 21) + -s 5 and 6 + |f'(w)|_inf <= eps*min(pos,neg)/l*|f'(w0)|_inf, + where f is the primal function (default 0.01) + -s 12 and 13 + |f'(alpha)|_1 <= eps |f'(alpha0)|, + where f is the dual function (default 0.1) + -B bias : if bias >= 0, instance x becomes [x; bias]; if < 0, no bias term added (default -1) + -R : not regularize the bias; must with -B 1 to have the bias; DON'T use this unless you know what it is + (for -s 0, 2, 5, 6, 11)" + -wi weight: weights adjust the parameter C of different classes (see README for details) + -v n: n-fold cross validation mode + -C : find parameters (C for -s 0, 2 and C, p for -s 11) + -q : quiet mode (no outputs) + """ + prob, param = None, None + if isinstance(arg1, (list, tuple)) or (scipy and isinstance(arg1, scipy.ndarray)): + assert isinstance(arg2, (list, tuple)) or (scipy and isinstance(arg2, (scipy.ndarray, sparse.spmatrix))) + y, x, options = arg1, arg2, arg3 + prob = problem(y, x) + param = parameter(options) + elif isinstance(arg1, problem): + prob = arg1 + if isinstance(arg2, parameter): + param = arg2 + else: + param = parameter(arg2) + if prob == None or param == None : + raise TypeError("Wrong types for the arguments") + + prob.set_bias(param.bias) + 
liblinear.set_print_string_function(param.print_func) + err_msg = liblinear.check_parameter(prob, param) + if err_msg : + raise ValueError('Error: %s' % err_msg) + + if param.flag_find_parameters: + nr_fold = param.nr_fold + best_C = c_double() + best_p = c_double() + best_score = c_double() + if param.flag_C_specified: + start_C = param.C + else: + start_C = -1.0 + if param.flag_p_specified: + start_p = param.p + else: + start_p = -1.0 + liblinear.find_parameters(prob, param, nr_fold, start_C, start_p, best_C, best_p, best_score) + if param.solver_type in [L2R_LR, L2R_L2LOSS_SVC]: + print("Best C = %g CV accuracy = %g%%\n"% (best_C.value, 100.0*best_score.value)) + elif param.solver_type in [L2R_L2LOSS_SVR]: + print("Best C = %g Best p = %g CV MSE = %g\n"% (best_C.value, best_p.value, best_score.value)) + return best_C.value,best_p.value,best_score.value + + + elif param.flag_cross_validation: + l, nr_fold = prob.l, param.nr_fold + target = (c_double * l)() + liblinear.cross_validation(prob, param, nr_fold, target) + ACC, MSE, SCC = evaluations(prob.y[:l], target[:l]) + if param.solver_type in [L2R_L2LOSS_SVR, L2R_L2LOSS_SVR_DUAL, L2R_L1LOSS_SVR_DUAL]: + print("Cross Validation Mean squared error = %g" % MSE) + print("Cross Validation Squared correlation coefficient = %g" % SCC) + return MSE + else: + print("Cross Validation Accuracy = %g%%" % ACC) + return ACC + else: + m = liblinear.train(prob, param) + m = toPyModel(m) + + return m def predict(y, x, m, options=""): - """ - predict(y, x, m [, options]) -> (p_labels, p_acc, p_vals) - - y: a list/tuple/ndarray of l true labels (type must be int/double). - It is used for calculating the accuracy. Use [] if true labels are - unavailable. - - x: 1. a list/tuple of l training instances. Feature vector of - each training instance is a list/tuple or dictionary. - - 2. an l * n numpy ndarray or scipy spmatrix (n: number of features). - - Predict data (y, x) with the SVM model m. - options: - -b probability_estimates: whether to output probability estimates, 0 or 1 (default 0); currently for logistic regression only - -q quiet mode (no outputs) - - The return tuple contains - p_labels: a list of predicted labels - p_acc: a tuple including accuracy (for classification), mean-squared - error, and squared correlation coefficient (for regression). - p_vals: a list of decision values or probability estimates (if '-b 1' - is specified). If k is the number of classes, for decision values, - each element includes results of predicting k binary-class - SVMs. if k = 2 and solver is not MCSVM_CS, only one decision value - is returned. For probabilities, each element contains k values - indicating the probability that the testing instance is in each class. - Note that the order of classes here is the same as 'model.label' - field in the model structure. 
- """ - - def info(s): - print(s) - - if scipy and isinstance(x, scipy.ndarray): - x = scipy.ascontiguousarray(x) # enforce row-major - elif sparse and isinstance(x, sparse.spmatrix): - x = x.tocsr() - elif not isinstance(x, (list, tuple)): - raise TypeError("type of x: {0} is not supported!".format(type(x))) - - if (not isinstance(y, (list, tuple))) and (not (scipy and isinstance(y, scipy.ndarray))): - raise TypeError("type of y: {0} is not supported!".format(type(y))) - - predict_probability = 0 - argv = options.split() - i = 0 - while i < len(argv): - if argv[i] == '-b': - i += 1 - predict_probability = int(argv[i]) - elif argv[i] == '-q': - info = print_null - else: - raise ValueError("Wrong options") - i+=1 - - solver_type = m.param.solver_type - nr_class = m.get_nr_class() - nr_feature = m.get_nr_feature() - is_prob_model = m.is_probability_model() - bias = m.bias - if bias >= 0: - biasterm = feature_node(nr_feature+1, bias) - else: - biasterm = feature_node(-1, bias) - pred_labels = [] - pred_values = [] - - if scipy and isinstance(x, sparse.spmatrix): - nr_instance = x.shape[0] - else: - nr_instance = len(x) - - if predict_probability: - if not is_prob_model: - raise TypeError('probability output is only supported for logistic regression') - prob_estimates = (c_double * nr_class)() - for i in range(nr_instance): - if scipy and isinstance(x, sparse.spmatrix): - indslice = slice(x.indptr[i], x.indptr[i+1]) - xi, idx = gen_feature_nodearray((x.indices[indslice], x.data[indslice]), feature_max=nr_feature) - else: - xi, idx = gen_feature_nodearray(x[i], feature_max=nr_feature) - xi[-2] = biasterm - label = liblinear.predict_probability(m, xi, prob_estimates) - values = prob_estimates[:nr_class] - pred_labels += [label] - pred_values += [values] - else: - if nr_class <= 2: - nr_classifier = 1 - else: - nr_classifier = nr_class - dec_values = (c_double * nr_classifier)() - for i in range(nr_instance): - if scipy and isinstance(x, sparse.spmatrix): - indslice = slice(x.indptr[i], x.indptr[i+1]) - xi, idx = gen_feature_nodearray((x.indices[indslice], x.data[indslice]), feature_max=nr_feature) - else: - xi, idx = gen_feature_nodearray(x[i], feature_max=nr_feature) - xi[-2] = biasterm - label = liblinear.predict_values(m, xi, dec_values) - values = dec_values[:nr_classifier] - pred_labels += [label] - pred_values += [values] - - if len(y) == 0: - y = [0] * nr_instance - ACC, MSE, SCC = evaluations(y, pred_labels) - - if m.is_regression_model(): - info("Mean squared error = %g (regression)" % MSE) - info("Squared correlation coefficient = %g (regression)" % SCC) - else: - info("Accuracy = %g%% (%d/%d) (classification)" % (ACC, int(round(nr_instance*ACC/100)), nr_instance)) - - return pred_labels, (ACC, MSE, SCC), pred_values + """ + predict(y, x, m [, options]) -> (p_labels, p_acc, p_vals) + + y: a list/tuple/ndarray of l true labels (type must be int/double). + It is used for calculating the accuracy. Use [] if true labels are + unavailable. + + x: 1. a list/tuple of l training instances. Feature vector of + each training instance is a list/tuple or dictionary. + + 2. an l * n numpy ndarray or scipy spmatrix (n: number of features). + + Predict data (y, x) with the SVM model m. 
+ options: + -b probability_estimates: whether to output probability estimates, 0 or 1 (default 0); currently for logistic regression only + -q quiet mode (no outputs) + + The return tuple contains + p_labels: a list of predicted labels + p_acc: a tuple including accuracy (for classification), mean-squared + error, and squared correlation coefficient (for regression). + p_vals: a list of decision values or probability estimates (if '-b 1' + is specified). If k is the number of classes, for decision values, + each element includes results of predicting k binary-class + SVMs. if k = 2 and solver is not MCSVM_CS, only one decision value + is returned. For probabilities, each element contains k values + indicating the probability that the testing instance is in each class. + Note that the order of classes here is the same as 'model.label' + field in the model structure. + """ + + def info(s): + print(s) + + if scipy and isinstance(x, scipy.ndarray): + x = scipy.ascontiguousarray(x) # enforce row-major + elif sparse and isinstance(x, sparse.spmatrix): + x = x.tocsr() + elif not isinstance(x, (list, tuple)): + raise TypeError("type of x: {0} is not supported!".format(type(x))) + + if (not isinstance(y, (list, tuple))) and (not (scipy and isinstance(y, scipy.ndarray))): + raise TypeError("type of y: {0} is not supported!".format(type(y))) + + predict_probability = 0 + argv = options.split() + i = 0 + while i < len(argv): + if argv[i] == '-b': + i += 1 + predict_probability = int(argv[i]) + elif argv[i] == '-q': + info = print_null + else: + raise ValueError("Wrong options") + i+=1 + + solver_type = m.param.solver_type + nr_class = m.get_nr_class() + nr_feature = m.get_nr_feature() + is_prob_model = m.is_probability_model() + bias = m.bias + if bias >= 0: + biasterm = feature_node(nr_feature+1, bias) + else: + biasterm = feature_node(-1, bias) + pred_labels = [] + pred_values = [] + + if scipy and isinstance(x, sparse.spmatrix): + nr_instance = x.shape[0] + else: + nr_instance = len(x) + + if predict_probability: + if not is_prob_model: + raise TypeError('probability output is only supported for logistic regression') + prob_estimates = (c_double * nr_class)() + for i in range(nr_instance): + if scipy and isinstance(x, sparse.spmatrix): + indslice = slice(x.indptr[i], x.indptr[i+1]) + xi, idx = gen_feature_nodearray((x.indices[indslice], x.data[indslice]), feature_max=nr_feature) + else: + xi, idx = gen_feature_nodearray(x[i], feature_max=nr_feature) + xi[-2] = biasterm + label = liblinear.predict_probability(m, xi, prob_estimates) + values = prob_estimates[:nr_class] + pred_labels += [label] + pred_values += [values] + else: + if nr_class <= 2: + nr_classifier = 1 + else: + nr_classifier = nr_class + dec_values = (c_double * nr_classifier)() + for i in range(nr_instance): + if scipy and isinstance(x, sparse.spmatrix): + indslice = slice(x.indptr[i], x.indptr[i+1]) + xi, idx = gen_feature_nodearray((x.indices[indslice], x.data[indslice]), feature_max=nr_feature) + else: + xi, idx = gen_feature_nodearray(x[i], feature_max=nr_feature) + xi[-2] = biasterm + label = liblinear.predict_values(m, xi, dec_values) + values = dec_values[:nr_classifier] + pred_labels += [label] + pred_values += [values] + + if len(y) == 0: + y = [0] * nr_instance + ACC, MSE, SCC = evaluations(y, pred_labels) + + if m.is_regression_model(): + info("Mean squared error = %g (regression)" % MSE) + info("Squared correlation coefficient = %g (regression)" % SCC) + else: + info("Accuracy = %g%% (%d/%d) (classification)" % (ACC, 
int(round(nr_instance*ACC/100)), nr_instance))
+
+    return pred_labels, (ACC, MSE, SCC), pred_values
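
For reference, a minimal end-to-end use of this Python interface (a sketch,
not part of the patch; it assumes a built liblinear Python package with
scipy available, and the data file name is illustrative):

    from liblinear.liblinearutil import (
        svm_read_problem, train, predict, csr_find_scale_param, csr_scale)

    # Read sparse data (illustrative file name), rescale each feature to
    # [0, 1], train L2-regularized L2-loss SVC in the primal (-s 2), and
    # predict on the training data.
    y, x = svm_read_problem('heart_scale', return_scipy=True)
    scale_param = csr_find_scale_param(x, lower=0)
    x = csr_scale(x, scale_param)
    m = train(y, x, '-s 2 -c 1')
    p_labels, (ACC, MSE, SCC), p_vals = predict(y, x, m)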