From: Chih-Jen Lin
Date: Thu, 26 Jul 2018 15:11:23 +0000 (+0800)
Subject: Include LIBSVM's svm-scale.c in the package.
X-Git-Tag: v221~7
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=44012bc10f8dc5fc9dbf75b93b4fa3e0e3c29200;p=liblinear

Include LIBSVM's svm-scale.c in the package.

Add two functions (csr_find_scale_param, csr_scale) for data scaling in
the Python interface. Remove the utility functions (svm_read_problem,
evaluations) from liblinearutil.py and use them directly from LIBSVM's
commonutil.py. Functions for scaling are also put in commonutil.py.
---

diff --git a/README b/README
index 53a97e4..287edb1 100644
--- a/README
+++ b/README
@@ -17,6 +17,7 @@ Table of Contents
 - Installation
 - `train' Usage
 - `predict' Usage
+- `svm-scale' Usage
 - Examples
 - Library Usage
 - Building Windows Binaries
@@ -75,8 +76,8 @@ sparse data, use `-l 0' to keep the sparsity.
 Installation
 ============
 
-On Unix systems, type `make' to build the `train' and `predict'
-programs. Run them without arguments to show the usages.
+On Unix systems, type `make' to build the `train', `predict',
+and `svm-scale' programs. Run them without arguments to show the usages.
 
 On other systems, consult `Makefile' to build them (e.g., see
 'Building Windows binaries' in this file) or use the pre-built
@@ -92,6 +93,8 @@ and mark
 
 	LIBS ?= blas/blas.a
 
+The tool `svm-scale', borrowed from LIBSVM, is for scaling input data files.
+
 `train' Usage
 =============
 
@@ -233,6 +236,11 @@ options:
 Note that -b is only needed in the prediction phase. This is different
 from the setting of LIBSVM.
 
+`svm-scale' Usage
+=================
+
+See LIBSVM README.
+
 Examples
 ========
 
@@ -437,7 +445,7 @@ in linear.h, so you can check the version number.
 
 - Function: void find_parameter_C(const struct problem *prob,
             const struct parameter *param, int nr_fold, double start_C,
-            double max_C, double *best_C, double *best_rate);
+	    double max_C, double *best_C, double *best_rate);
 
   This function is similar to cross_validation. However, instead of
   conducting cross validation under a specified parameter C, it
@@ -581,8 +589,8 @@ nmake -f Makefile.win clean all
 nmake -f Makefile.win lib
 
 4. (Optional) To build 32-bit windows binaries, you must
-   (1) Setup "C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin\vcvars32.bat" instead of vcvars64.bat
-   (2) Change CFLAGS in Makefile.win: /D _WIN64 to /D _WIN32
+	(1) Setup "C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin\vcvars32.bat" instead of vcvars64.bat
+	(2) Change CFLAGS in Makefile.win: /D _WIN64 to /D _WIN32
 
 MATLAB/OCTAVE Interface
 =======================
diff --git a/python/README b/python/README
index 351e70f..ccc3531 100644
--- a/python/README
+++ b/python/README
@@ -108,6 +108,11 @@ in liblinearutil.py and the usage is the same as the LIBLINEAR MATLAB interface.
 >>> param = parameter('-s 0 -c 4 -B 1')
 >>> m = train(prob, param)
 
+# Apply data scaling in Scipy format
+>>> y, x = svm_read_problem('../heart_scale', return_scipy=True)
+>>> scale_param = csr_find_scale_param(x, lower=0)
+>>> scaled_x = csr_scale(x, scale_param)
+
 # Other utility functions
 >>> save_model('heart_scale.model', m)
 >>> m = load_model('heart_scale.model')
@@ -193,10 +198,10 @@ LIBLINEAR shared library:
 
 	y: a Python list/tuple/ndarray of l labels (type must be int/double).
 
-	x: 1. a list/tuple of l training instances. Feature vector of
-	   each training instance is a list/tuple or dictionary.
+	x: 1. a list/tuple of l training instances. Feature vector of
+	   each training instance is a list/tuple or dictionary.
 
-	   2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
+	   2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
 
 	bias: if bias >= 0, instance x becomes [x; bias]; if < 0, no bias term
 	      added (default -1)
 
@@ -310,10 +315,10 @@ The above command loads
 
 	y: a list/tuple/ndarray of l training labels (type must be int/double).
 
-	x: 1. a list/tuple of l training instances. Feature vector of
-	   each training instance is a list/tuple or dictionary.
+	x: 1. a list/tuple of l training instances. Feature vector of
+	   each training instance is a list/tuple or dictionary.
 
-	   2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
+	   2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
 
 	training_options: a string in the same form as that for LIBLINEAR
 			  command mode.
@@ -356,13 +361,13 @@ The above command loads
 >>> p_labs, p_acc, p_vals = predict(y, x, model [,'predicting_options'])
 
 	y: a list/tuple/ndarray of l true labels (type must be int/double).
-	   It is used for calculating the accuracy. Use [] if true labels are
+	   It is used for calculating the accuracy. Use [] if true labels are
 	   unavailable.
 
-	x: 1. a list/tuple of l training instances. Feature vector of
-	   each training instance is a list/tuple or dictionary.
+	x: 1. a list/tuple of l training instances. Feature vector of
+	   each training instance is a list/tuple or dictionary.
 
-	   2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
+	   2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
 
 	predicting_options: a string of predicting options in the same format
 			    as that of LIBLINEAR.
@@ -399,7 +404,7 @@ The above command loads
 
 - Function: evaluations
 
-	Calculate some evaluations using the true values (ty) and predicted
+	Calculate some evaluations using the true values (ty) and the predicted
 	values (pv):
 
 	>>> (ACC, MSE, SCC) = evaluations(ty, pv, useScipy)
@@ -416,6 +421,26 @@ The above command loads
 
 	SCC: squared correlation coefficient.
 
+- Function: csr_find_scale_param/csr_scale
+
+	Scale data in csr format.
+
+	>>> param = csr_find_scale_param(x [, lower=l, upper=u])
+	>>> x = csr_scale(x, param)
+
+	x: a csr_matrix of data.
+
+	l: x scaling lower limit; default -1.
+
+	u: x scaling upper limit; default 1.
+
+	The scaling process is: x * diag(coef) + ones(l, 1) * offset'
+
+	param: a dictionary of scaling parameters, where param['coef'] = coef and param['offset'] = offset.
+
+	coef: a scipy array of scaling coefficients.
+
+	offset: a scipy array of scaling offsets.
 
 Additional Information
 ======================
diff --git a/python/commonutil.py b/python/commonutil.py
new file mode 100644
index 0000000..f2da8de
--- /dev/null
+++ b/python/commonutil.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+import sys
+
+try:
+	import scipy
+	from scipy import sparse
+except:
+	scipy = None
+	sparse = None
+
+
+__all__ = ['svm_read_problem', 'evaluations', 'csr_find_scale_param', 'csr_scale']
+
+def svm_read_problem(data_file_name, return_scipy=False):
+	"""
+	svm_read_problem(data_file_name, return_scipy=False) -> [y, x], y: list, x: list of dictionary
+	svm_read_problem(data_file_name, return_scipy=True) -> [y, x], y: ndarray, x: csr_matrix
+
+	Read LIBSVM-format data from data_file_name and return labels y
+	and data instances x.
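+
+	When return_scipy is True, the instances are assembled into a
+	scipy.sparse.csr_matrix from (data, indices, indptr) arrays, and
+	explicit zeros in the file are skipped so the matrix stays sparse.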
+	"""
+	prob_y = []
+	prob_x = []
+	row_ptr = [0]
+	col_idx = []
+	for i, line in enumerate(open(data_file_name)):
+		line = line.split(None, 1)
+		# In case an instance with all zero features
+		if len(line) == 1: line += ['']
+		label, features = line
+		prob_y += [float(label)]
+		if scipy != None and return_scipy:
+			nz = 0
+			for e in features.split():
+				ind, val = e.split(":")
+				val = float(val)
+				if val != 0:
+					col_idx += [int(ind)-1]
+					prob_x += [val]
+					nz += 1
+			row_ptr += [row_ptr[-1]+nz]
+		else:
+			xi = {}
+			for e in features.split():
+				ind, val = e.split(":")
+				xi[int(ind)] = float(val)
+			prob_x += [xi]
+	if scipy != None and return_scipy:
+		prob_y = scipy.array(prob_y)
+		prob_x = scipy.array(prob_x)
+		col_idx = scipy.array(col_idx)
+		row_ptr = scipy.array(row_ptr)
+		prob_x = sparse.csr_matrix((prob_x, col_idx, row_ptr))
+	return (prob_y, prob_x)
+
+def evaluations_scipy(ty, pv):
+	"""
+	evaluations_scipy(ty, pv) -> (ACC, MSE, SCC)
+	ty, pv: ndarray
+
+	Calculate accuracy, mean squared error and squared correlation coefficient
+	using the true values (ty) and predicted values (pv).
+	"""
+	if not (scipy != None and isinstance(ty, scipy.ndarray) and isinstance(pv, scipy.ndarray)):
+		raise TypeError("type of ty and pv must be ndarray")
+	if len(ty) != len(pv):
+		raise ValueError("len(ty) must be equal to len(pv)")
+	ACC = 100.0*(ty == pv).mean()
+	MSE = ((ty - pv)**2).mean()
+	l = len(ty)
+	sumv = pv.sum()
+	sumy = ty.sum()
+	sumvy = (pv*ty).sum()
+	sumvv = (pv*pv).sum()
+	sumyy = (ty*ty).sum()
+	with scipy.errstate(all = 'raise'):
+		try:
+			SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
+		except:
+			SCC = float('nan')
+	return (float(ACC), float(MSE), float(SCC))
+
+def evaluations(ty, pv, useScipy = True):
+	"""
+	evaluations(ty, pv, useScipy) -> (ACC, MSE, SCC)
+	ty, pv: list, tuple or ndarray
+	useScipy: convert ty, pv to ndarray, and use scipy functions for the evaluation
+
+	Calculate accuracy, mean squared error and squared correlation coefficient
+	using the true values (ty) and predicted values (pv).
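+
+	SCC is the squared Pearson correlation coefficient between ty and pv;
+	it is NaN when either sequence has zero variance.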
+	"""
+	if scipy != None and useScipy:
+		return evaluations_scipy(scipy.asarray(ty), scipy.asarray(pv))
+	if len(ty) != len(pv):
+		raise ValueError("len(ty) must be equal to len(pv)")
+	total_correct = total_error = 0
+	sumv = sumy = sumvv = sumyy = sumvy = 0
+	for v, y in zip(pv, ty):
+		if y == v:
+			total_correct += 1
+		total_error += (v-y)*(v-y)
+		sumv += v
+		sumy += y
+		sumvv += v*v
+		sumyy += y*y
+		sumvy += v*y
+	l = len(ty)
+	ACC = 100.0*total_correct/l
+	MSE = total_error/l
+	try:
+		SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
+	except:
+		SCC = float('nan')
+	return (float(ACC), float(MSE), float(SCC))
+
+def csr_find_scale_param(x, lower=-1, upper=1):
+	assert isinstance(x, sparse.csr_matrix)
+	assert lower < upper
+	l, n = x.shape
+	feat_min = x.min(axis=0).toarray().flatten()
+	feat_max = x.max(axis=0).toarray().flatten()
+	coef = (feat_max - feat_min) / (upper - lower)
+	coef[coef != 0] = 1.0 / coef[coef != 0]
+
+	# (x - ones(l,1) * feat_min') * diag(coef) + lower
+	# = x * diag(coef) - ones(l, 1) * (feat_min' * diag(coef)) + lower
+	# = x * diag(coef) + ones(l, 1) * (-feat_min' * diag(coef) + lower)
+	# = x * diag(coef) + ones(l, 1) * offset'
+	offset = -feat_min * coef + lower
+	offset[coef == 0] = 0
+
+	if sum(offset != 0) * l > 3 * x.getnnz():
+		print(
+			"WARNING: The #nonzeros of the scaled data is at least 2 times larger than the original one.\n"
+			"If feature values are non-negative and sparse, set lower=0 rather than the default lower=-1.",
+			file=sys.stderr)
+
+	return {'coef':coef, 'offset':offset}
+
+def csr_scale(x, scale_param):
+	assert isinstance(x, sparse.csr_matrix)
+
+	offset = scale_param['offset']
+	coef = scale_param['coef']
+	assert len(coef) == len(offset)
+
+	l, n = x.shape
+
+	if not n == len(coef):
+		print("WARNING: The dimension of scaling parameters and feature number do not match.", file=sys.stderr)
+		coef = scipy.resize(coef, n)
+		offset = scipy.resize(offset, n)
+
+	# scaled_x = x * diag(coef) + ones(l, 1) * offset'
+	offset = sparse.csr_matrix(offset.reshape(1, n))
+	offset = sparse.vstack([offset] * l, format='csr', dtype=x.dtype)
+	scaled_x = x.dot(sparse.diags(coef, 0, shape=(n, n))) + offset
+
+	if scaled_x.getnnz() > x.getnnz():
+		print(
+			"WARNING: original #nonzeros %d\n" % x.getnnz() +
+			"       < new      #nonzeros %d\n" % scaled_x.getnnz() +
+			"If feature values are non-negative and sparse, get scale_param by setting lower=0 rather than the default lower=-1.",
+			file=sys.stderr)
diff --git a/python/liblinearutil.py b/python/liblinearutil.py
index d3f3015..9a07f59 100644
--- a/python/liblinearutil.py
+++ b/python/liblinearutil.py
@@ -5,58 +5,17 @@ sys.path = [os.path.dirname(os.path.abspath(__file__))] + sys.path
 from liblinear import *
 from liblinear import __all__ as liblinear_all
 from liblinear import scipy, sparse
+from commonutil import *
+from commonutil import __all__ as common_all
 from ctypes import c_double
 
 if sys.version_info[0] < 3:
 	range = xrange
 	from itertools import izip as zip
 
-__all__ = ['svm_read_problem', 'load_model', 'save_model', 'evaluations',
-           'train', 'predict'] + liblinear_all
+__all__ = ['load_model', 'save_model', 'train', 'predict'] + liblinear_all + common_all
 
-def svm_read_problem(data_file_name, return_scipy=False):
-	"""
-	svm_read_problem(data_file_name, return_scipy=False) -> [y, x], y: list, x: list of dictionary
-	svm_read_problem(data_file_name, return_scipy=True) -> [y, x], y: ndarray, x: csr_matrix
-
-	Read LIBSVM-format data from data_file_name and return labels y
-	and data instances x.
-	"""
-	prob_y = []
-	prob_x = []
-	row_ptr = [0]
-	col_idx = []
-	for i, line in enumerate(open(data_file_name)):
-		line = line.split(None, 1)
-		# In case an instance with all zero features
-		if len(line) == 1: line += ['']
-		label, features = line
-		prob_y += [float(label)]
-		if scipy != None and return_scipy:
-			nz = 0
-			for e in features.split():
-				ind, val = e.split(":")
-				val = float(val)
-				if val != 0:
-					col_idx += [int(ind)-1]
-					prob_x += [val]
-					nz += 1
-			row_ptr += [row_ptr[-1]+nz]
-		else:
-			xi = {}
-			for e in features.split():
-				ind, val = e.split(":")
-				xi[int(ind)] = float(val)
-			prob_x += [xi]
-	if scipy != None and return_scipy:
-		prob_y = scipy.array(prob_y)
-		prob_x = scipy.array(prob_x)
-		col_idx = scipy.array(col_idx)
-		row_ptr = scipy.array(row_ptr)
-		prob_x = sparse.csr_matrix((prob_x, col_idx, row_ptr))
-	return (prob_y, prob_x)
-
 def load_model(model_file_name):
 	"""
 	load_model(model_file_name) -> model
@@ -78,66 +37,6 @@ def save_model(model_file_name, model):
 	"""
 	liblinear.save_model(model_file_name.encode(), model)
 
-def evaluations_scipy(ty, pv):
-	"""
-	evaluations_scipy(ty, pv) -> (ACC, MSE, SCC)
-	ty, pv: ndarray
-
-	Calculate accuracy, mean squared error and squared correlation coefficient
-	using the true values (ty) and predicted values (pv).
-	"""
-	if not (scipy != None and isinstance(ty, scipy.ndarray) and isinstance(pv, scipy.ndarray)):
-		raise TypeError("type of ty and pv must be ndarray")
-	if len(ty) != len(pv):
-		raise ValueError("len(ty) must be equal to len(pv)")
-	ACC = 100.0*(ty == pv).mean()
-	MSE = ((ty - pv)**2).mean()
-	l = len(ty)
-	sumv = pv.sum()
-	sumy = ty.sum()
-	sumvy = (pv*ty).sum()
-	sumvv = (pv*pv).sum()
-	sumyy = (ty*ty).sum()
-	with scipy.errstate(all = 'raise'):
-		try:
-			SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
-		except:
-			SCC = float('nan')
-	return (float(ACC), float(MSE), float(SCC))
-
-def evaluations(ty, pv, useScipy = True):
-	"""
-	evaluations(ty, pv, useScipy) -> (ACC, MSE, SCC)
-	ty, pv: list, tuple or ndarray
-	useScipy: convert ty, pv to ndarray, and use scipy functions for the evaluation
-
-	Calculate accuracy, mean squared error and squared correlation coefficient
-	using the true values (ty) and predicted values (pv).
-	"""
-	if scipy != None and useScipy:
-		return evaluations_scipy(scipy.asarray(ty), scipy.asarray(pv))
-	if len(ty) != len(pv):
-		raise ValueError("len(ty) must be equal to len(pv)")
-	total_correct = total_error = 0
-	sumv = sumy = sumvv = sumyy = sumvy = 0
-	for v, y in zip(pv, ty):
-		if y == v:
-			total_correct += 1
-		total_error += (v-y)*(v-y)
-		sumv += v
-		sumy += y
-		sumvv += v*v
-		sumyy += y*y
-		sumvy += v*y
-	l = len(ty)
-	ACC = 100.0*total_correct/l
-	MSE = total_error/l
-	try:
-		SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
-	except:
-		SCC = float('nan')
-	return (float(ACC), float(MSE), float(SCC))
-
 def train(arg1, arg2=None, arg3=None):
 	"""
 	train(y, x [, options]) -> model | ACC
diff --git a/svm-scale.c b/svm-scale.c
new file mode 100644
index 0000000..63d1677
--- /dev/null
+++ b/svm-scale.c
@@ -0,0 +1,405 @@
+#include <float.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+
+void exit_with_help()
+{
+	printf(
+	"Usage: svm-scale [options] data_filename\n"
+	"options:\n"
+	"-l lower : x scaling lower limit (default -1)\n"
+	"-u upper : x scaling upper limit (default +1)\n"
+	"-y y_lower y_upper : y scaling limits (default: no y scaling)\n"
+	"-s save_filename : save scaling parameters to save_filename\n"
+	"-r restore_filename : restore scaling parameters from restore_filename\n"
+	);
+	exit(1);
+}
+
+char *line = NULL;
+int max_line_len = 1024;
+double lower=-1.0,upper=1.0,y_lower,y_upper;
+int y_scaling = 0;
+double *feature_max;
+double *feature_min;
+double y_max = -DBL_MAX;
+double y_min = DBL_MAX;
+int max_index;
+int min_index;
+long int num_nonzeros = 0;
+long int new_num_nonzeros = 0;
+
+#define max(x,y) (((x)>(y))?(x):(y))
+#define min(x,y) (((x)<(y))?(x):(y))
+
+void output_target(double value);
+void output(int index, double value);
+char* readline(FILE *input);
+int clean_up(FILE *fp_restore, FILE *fp, const char *msg);
+
+int main(int argc,char **argv)
+{
+	int i,index;
+	FILE *fp, *fp_restore = NULL;
+	char *save_filename = NULL;
+	char *restore_filename = NULL;
+
+	for(i=1;i<argc;i++)
+	{
+		if(argv[i][0] != '-') break;
+		++i;
+		switch(argv[i-1][1])
+		{
+			case 'l': lower = atof(argv[i]); break;
+			case 'u': upper = atof(argv[i]); break;
+			case 'y':
+				y_lower = atof(argv[i]);
+				++i;
+				y_upper = atof(argv[i]);
+				y_scaling = 1;
+				break;
+			case 's': save_filename = argv[i]; break;
+			case 'r': restore_filename = argv[i]; break;
+			default:
+				fprintf(stderr,"unknown option\n");
+				exit_with_help();
+		}
+	}
+
+	if(!(upper > lower) || (y_scaling && !(y_upper > y_lower)))
+	{
+		fprintf(stderr,"inconsistent lower/upper specification\n");
+		exit(1);
+	}
+
+	if(restore_filename && save_filename)
+	{
+		fprintf(stderr,"cannot use -r and -s simultaneously\n");
+		exit(1);
+	}
+
+	if(argc != i+1)
+		exit_with_help();
+
+	fp=fopen(argv[i],"r");
+
+	if(fp==NULL)
+	{
+		fprintf(stderr,"can't open file %s\n", argv[i]);
+		exit(1);
+	}
+
+	line = (char *) malloc(max_line_len*sizeof(char));
+
+#define SKIP_TARGET\
+	while(isspace(*p)) ++p;\
+	while(!isspace(*p)) ++p;
+
+#define SKIP_ELEMENT\
+	while(*p!=':') ++p;\
+	++p;\
+	while(isspace(*p)) ++p;\
+	while(*p && !isspace(*p)) ++p;
+
+	/* assumption: min index of attributes is 1 */
+	/* pass 1: find out max index of attributes */
+	max_index = 0;
+	min_index = 1;
+
+	if(restore_filename)
+	{
+		int idx, c;
+
+		fp_restore = fopen(restore_filename,"r");
+		if(fp_restore==NULL)
+		{
+			fprintf(stderr,"can't open file %s\n", restore_filename);
+			exit(1);
+		}
+
+		c = fgetc(fp_restore);
+		if(c == 'y')
+		{
+			readline(fp_restore);
+			readline(fp_restore);
+			readline(fp_restore);
+		}
+		readline(fp_restore);
+		readline(fp_restore);
+
+		while(fscanf(fp_restore,"%d %*f %*f\n",&idx) == 1)
+			max_index = max(idx,max_index);
+		rewind(fp_restore);
+	}
+
+	while(readline(fp)!=NULL)
+	{
+		char *p=line;
+
+		SKIP_TARGET
+
+		while(sscanf(p,"%d:%*f",&index)==1)
+		{
+			max_index = max(max_index, index);
+			min_index = min(min_index, index);
+			SKIP_ELEMENT
+			num_nonzeros++;
+		}
+	}
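+
+	/* The data file is scanned twice more below: pass 2 records each
+	   feature's min/max (or restores them from -r), and pass 3 rewrites
+	   every instance with the scaled values; fp is rewound between passes. */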
+
+	if(min_index < 1)
+		fprintf(stderr,
+			"WARNING: minimal feature index is %d, but indices should start from 1\n", min_index);
+
+	rewind(fp);
+
+	feature_max = (double *)malloc((max_index+1)* sizeof(double));
+	feature_min = (double *)malloc((max_index+1)* sizeof(double));
+
+	if(feature_max == NULL || feature_min == NULL)
+	{
+		fprintf(stderr,"can't allocate enough memory\n");
+		exit(1);
+	}
+
+	for(i=0;i<=max_index;i++)
+	{
+		feature_max[i]=-DBL_MAX;
+		feature_min[i]=DBL_MAX;
+	}
+
+	/* pass 2: find out min/max value */
+	while(readline(fp)!=NULL)
+	{
+		char *p=line;
+		int next_index=1;
+		double target;
+		double value;
+
+		if (sscanf(p,"%lf",&target) != 1)
+			return clean_up(fp_restore, fp, "ERROR: failed to read labels\n");
+		y_max = max(y_max,target);
+		y_min = min(y_min,target);
+
+		SKIP_TARGET
+
+		while(sscanf(p,"%d:%lf",&index,&value)==2)
+		{
+			for(i=next_index;i<index;i++)
+			{
+				feature_max[i]=max(feature_max[i],0);
+				feature_min[i]=min(feature_min[i],0);
+			}
+
+			feature_max[index]=max(feature_max[index],value);
+			feature_min[index]=min(feature_min[index],value);
+
+			SKIP_ELEMENT
+			next_index=index+1;
+		}
+
+		for(i=next_index;i<=max_index;i++)
+		{
+			feature_max[i]=max(feature_max[i],0);
+			feature_min[i]=min(feature_min[i],0);
+		}
+	}
+
+	rewind(fp);
+
+	/* pass 2.5: save/restore feature_min/feature_max */
+
+	if(restore_filename)
+	{
+		/* fp_restore rewound in finding max_index */
+		int idx, c;
+		double fmin, fmax;
+		int next_index = 1;
+
+		if((c = fgetc(fp_restore)) == 'y')
+		{
+			if(fscanf(fp_restore, "%lf %lf\n", &y_lower, &y_upper) != 2 ||
+			   fscanf(fp_restore, "%lf %lf\n", &y_min, &y_max) != 2)
+				return clean_up(fp_restore, fp, "ERROR: failed to read scaling parameters\n");
+			y_scaling = 1;
+		}
+		else
+			ungetc(c, fp_restore);
+
+		if (fgetc(fp_restore) == 'x')
+		{
+			if(fscanf(fp_restore, "%lf %lf\n", &lower, &upper) != 2)
+				return clean_up(fp_restore, fp, "ERROR: failed to read scaling parameters\n");
+			while(fscanf(fp_restore,"%d %lf %lf\n",&idx,&fmin,&fmax)==3)
+			{
+				for(i = next_index;i<idx;i++)
+					if(feature_min[i] != feature_max[i])
+						fprintf(stderr,
+							"WARNING: feature index %d appeared in file %s was not seen in the scaling factor file %s. The feature is scaled to 0.\n",
+							i, argv[argc-1], restore_filename);
+
+				feature_min[idx] = fmin;
+				feature_max[idx] = fmax;
+
+				next_index = idx + 1;
+			}
+
+			for(i=next_index;i<=max_index;i++)
+				if(feature_min[i] != feature_max[i])
+					fprintf(stderr,
+						"WARNING: feature index %d appeared in file %s was not seen in the scaling factor file %s. The feature is scaled to 0.\n",
+						i, argv[argc-1], restore_filename);
+		}
+	}
+
+	if(save_filename)
+	{
+		FILE *fp_save = fopen(save_filename,"w");
+		if(fp_save==NULL)
+		{
+			fprintf(stderr,"can't open file %s\n", save_filename);
+			exit(1);
+		}
+		if(y_scaling)
+		{
+			fprintf(fp_save, "y\n");
+			fprintf(fp_save, "%.17g %.17g\n", y_lower, y_upper);
+			fprintf(fp_save, "%.17g %.17g\n", y_min, y_max);
+		}
+		fprintf(fp_save, "x\n");
+		fprintf(fp_save, "%.17g %.17g\n", lower, upper);
+		for(i=1;i<=max_index;i++)
+		{
+			if(feature_min[i]!=feature_max[i])
+				fprintf(fp_save,"%d %.17g %.17g\n",i,feature_min[i],feature_max[i]);
+		}
+
+		if(min_index < 1)
+			fprintf(stderr,
+				"WARNING: scaling factors with indices smaller than 1 are not stored to the file %s.\n", save_filename);
+
+		fclose(fp_save);
+	}
+
+	/* pass 3: scale */
+	while(readline(fp)!=NULL)
+	{
+		char *p=line;
+		int next_index=1;
+		double target;
+		double value;
+
+		if (sscanf(p,"%lf",&target) != 1)
+			return clean_up(fp_restore, fp, "ERROR: failed to read labels\n");
+		output_target(target);
+
+		SKIP_TARGET
+
+		while(sscanf(p,"%d:%lf",&index,&value)==2)
+		{
+			for(i=next_index;i<index;i++)
+				output(i,0);
+
+			output(index,value);
+
+			SKIP_ELEMENT
+			next_index=index+1;
+		}
+
+		for(i=next_index;i<=max_index;i++)
+			output(i,0);
+
+		printf("\n");
+	}
+
+	if(new_num_nonzeros > num_nonzeros)
+		fprintf(stderr,
+			"WARNING: original #nonzeros %ld\n"
+			"       < new      #nonzeros %ld\n"
+			"If feature values are non-negative and sparse, use -l 0 rather than the default -l -1\n",
+			num_nonzeros, new_num_nonzeros);
+
+	free(line);
+	free(feature_max);
+	free(feature_min);
+	fclose(fp);
+	return 0;
+}
+
+char* readline(FILE *input)
+{
+	int len;
+
+	if(fgets(line,max_line_len,input) == NULL)
+		return NULL;
+
+	while(strrchr(line,'\n') == NULL)
+	{
+		max_line_len *= 2;
+		line = (char *) realloc(line, max_line_len);
+		len = (int) strlen(line);
+		if(fgets(line+len,max_line_len-len,input) == NULL)
+			break;
+	}
+	return line;
+}
+
+void output_target(double value)
+{
+	if(y_scaling)
+	{
+		if(value == y_min)
+			value = y_lower;
+		else if(value == y_max)
+			value = y_upper;
+		else value = y_lower + (y_upper-y_lower) *
+			     (value - y_min)/(y_max-y_min);
+	}
+	printf("%.17g ",value);
+}
+
+void output(int index, double value)
+{
+	/* skip single-valued attribute */
+	if(feature_max[index] == feature_min[index])
+		return;
+
+	if(value == feature_min[index])
+		value = lower;
+	else if(value == feature_max[index])
+		value = upper;
+	else
+		value = lower + (upper-lower) *
+			(value-feature_min[index])/
+			(feature_max[index]-feature_min[index]);
+
+	if(value != 0)
+	{
+		printf("%d:%g ",index, value);
+		new_num_nonzeros++;
+	}
+}
+
+int clean_up(FILE *fp_restore, FILE *fp, const char* msg)
+{
+	fprintf(stderr, "%s", msg);
+
+	free(line);
+	free(feature_max);
+	free(feature_min);
+	fclose(fp);
+	if (fp_restore)
+		fclose(fp_restore);
+
+	return -1;
+}
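
A minimal sketch of the intended end-to-end workflow with the new Python
scaling functions (the data path and training options are illustrative;
note that the scale_param found on the training data must be reused to
scale any test data, so both are mapped by the same coef/offset):

>>> from liblinearutil import *
>>> y, x = svm_read_problem('../heart_scale', return_scipy=True)
>>> scale_param = csr_find_scale_param(x, lower=0)
>>> m = train(y, csr_scale(x, scale_param), '-c 4')
>>> y_test, x_test = svm_read_problem('../heart_scale', return_scipy=True)
>>> p_labs, p_acc, p_vals = predict(y_test, csr_scale(x_test, scale_param), m)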