Add two functions (csr_find_scale_param, csr_scale) for data scaling to the Python interface.
Remove the utility functions (svm_read_problem, evaluations) from liblinearutil.py and import them directly from LIBSVM's commonutil.py. The scaling functions are also placed in commonutil.py.
- Installation
- `train' Usage
- `predict' Usage
+- `svm-scale' Usage
- Examples
- Library Usage
- Building Windows Binaries
Installation
============
-On Unix systems, type `make' to build the `train' and `predict'
-programs. Run them without arguments to show the usages.
+On Unix systems, type `make' to build the `train', `predict',
+and `svm-scale' programs. Run them without arguments to show the usages.
On other systems, consult `Makefile' to build them (e.g., see
'Building Windows binaries' in this file) or use the pre-built
LIBS ?= blas/blas.a
+The tool `svm-scale', borrowed from LIBSVM, is for scaling an input data file.
+
`train' Usage
=============
Note that -b is only needed in the prediction phase. This is different
from the setting of LIBSVM.
+`svm-scale' Usage
+=================
+
+See LIBSVM README.
+
Examples
========
- Function: void find_parameter_C(const struct problem *prob,
const struct parameter *param, int nr_fold, double start_C,
- double max_C, double *best_C, double *best_rate);
+ double max_C, double *best_C, double *best_rate);
This function is similar to cross_validation. However, instead of
conducting cross validation under a specified parameter C, it
nmake -f Makefile.win lib
4. (Optional) To build 32-bit windows binaries, you must
- (1) Setup "C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin\vcvars32.bat" instead of vcvars64.bat
- (2) Change CFLAGS in Makefile.win: /D _WIN64 to /D _WIN32
+ (1) Setup "C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin\vcvars32.bat" instead of vcvars64.bat
+ (2) Change CFLAGS in Makefile.win: /D _WIN64 to /D _WIN32
MATLAB/OCTAVE Interface
=======================
>>> param = parameter('-s 0 -c 4 -B 1')
>>> m = train(prob, param)
+# Apply data scaling in Scipy format
+>>> y, x = svm_read_problem('../heart_scale', return_scipy=True)
+>>> scale_param = csr_find_scale_param(x, lower=0)
+>>> scaled_x = csr_scale(x, scale_param)
+
# Other utility functions
>>> save_model('heart_scale.model', m)
>>> m = load_model('heart_scale.model')
y: a Python list/tuple/ndarray of l labels (type must be int/double).
- x: 1. a list/tuple of l training instances. Feature vector of
- each training instance is a list/tuple or dictionary.
+ x: 1. a list/tuple of l training instances. Feature vector of
+ each training instance is a list/tuple or dictionary.
- 2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
+ 2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
bias: if bias >= 0, instance x becomes [x; bias]; if < 0, no bias term
added (default -1)
y: a list/tuple/ndarray of l training labels (type must be int/double).
- x: 1. a list/tuple of l training instances. Feature vector of
- each training instance is a list/tuple or dictionary.
+ x: 1. a list/tuple of l training instances. Feature vector of
+ each training instance is a list/tuple or dictionary.
- 2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
+ 2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
training_options: a string in the same form as that for LIBLINEAR command
mode.
>>> p_labs, p_acc, p_vals = predict(y, x, model [,'predicting_options'])
y: a list/tuple/ndarray of l true labels (type must be int/double).
- It is used for calculating the accuracy. Use [] if true labels are
+ It is used for calculating the accuracy. Use [] if true labels are
unavailable.
- x: 1. a list/tuple of l training instances. Feature vector of
- each training instance is a list/tuple or dictionary.
+ x: 1. a list/tuple of l training instances. Feature vector of
+ each training instance is a list/tuple or dictionary.
- 2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
+ 2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
predicting_options: a string of predicting options in the same format as
that of LIBLINEAR.
- Function: evaluations
- Calculate some evaluations using the true values (ty) and predicted
+ Calculate some evaluations using the true values (ty) and the predicted
values (pv):
>>> (ACC, MSE, SCC) = evaluations(ty, pv, useScipy)
SCC: squared correlation coefficient.
+- Function: csr_find_scale_param/csr_scale
+
+ Scale data in csr format.
+
+ >>> param = csr_find_scale_param(x [, lower=l, upper=u])
+ >>> x = csr_scale(x, param)
+
+ x: a csr_matrix of data.
+
+ l: x scaling lower limit; default -1.
+
+ u: x scaling upper limit; default 1.
+
+ The scaling process is: x * diag(coef) + ones(m, 1) * offset', where m is the number of rows (instances) of x.
+
+ param: a dictionary of scaling parameters, where param['coef'] = coef and param['offset'] = offset.
+
+ coef: a scipy array of scaling coefficients.
+
+ offset: a scipy array of scaling offsets.
Additional Information
======================
--- /dev/null
+#!/usr/bin/env python
+
+from __future__ import print_function
+import sys
+
+try:
+ import scipy
+ from scipy import sparse
+except:
+ scipy = None
+ sparse = None
+
+
+__all__ = ['svm_read_problem', 'evaluations', 'csr_find_scale_param', 'csr_scale']
+
def svm_read_problem(data_file_name, return_scipy=False):
    """
    svm_read_problem(data_file_name, return_scipy=False) -> [y, x], y: list, x: list of dictionary
    svm_read_problem(data_file_name, return_scipy=True)  -> [y, x], y: ndarray, x: csr_matrix

    Read LIBSVM-format data from data_file_name and return labels y
    and data instances x.

    Each line is "<label> <index>:<value> ...". Labels are parsed as floats.
    Without scipy (or with return_scipy=False) every instance becomes a
    dictionary keyed by the feature index exactly as written in the file.
    With scipy, zero-valued entries are dropped and indices are shifted by
    one so that feature 1 lands in column 0 of the csr_matrix.
    Blank lines are skipped.
    """
    use_scipy = scipy is not None and return_scipy
    prob_y = []
    prob_x = []
    row_ptr = [0]
    col_idx = []
    # The context manager closes the file even when a line fails to parse.
    with open(data_file_name) as data_file:
        for line in data_file:
            fields = line.split(None, 1)
            if not fields:
                # Blank line: nothing to read.
                continue
            # In case an instance with all zero features
            if len(fields) == 1:
                fields.append('')
            label, features = fields
            prob_y.append(float(label))
            if use_scipy:
                nz = 0
                for e in features.split():
                    ind, val = e.split(":")
                    val = float(val)
                    if val != 0:
                        col_idx.append(int(ind) - 1)
                        prob_x.append(val)
                        nz += 1
                row_ptr.append(row_ptr[-1] + nz)
            else:
                xi = {}
                for e in features.split():
                    ind, val = e.split(":")
                    xi[int(ind)] = float(val)
                prob_x.append(xi)
    if use_scipy:
        # scipy.array is a deprecated alias of numpy.array; call NumPy
        # directly (it is always installed alongside SciPy).
        import numpy as np
        prob_y = np.array(prob_y)
        prob_x = np.array(prob_x)
        col_idx = np.array(col_idx)
        row_ptr = np.array(row_ptr)
        prob_x = sparse.csr_matrix((prob_x, col_idx, row_ptr))
    return (prob_y, prob_x)
+
def evaluations_scipy(ty, pv):
    """
    evaluations_scipy(ty, pv) -> (ACC, MSE, SCC)
    ty, pv: ndarray

    Calculate accuracy, mean squared error and squared correlation coefficient
    using the true values (ty) and predicted values (pv).

    Raises TypeError when scipy is unavailable or an argument is not an
    ndarray, and ValueError when the two arrays differ in length.
    SCC is NaN whenever its computation hits an arithmetic error
    (for example 0/0 when either array is constant).
    """
    if scipy is None:
        raise TypeError("type of ty and pv must be ndarray")
    # scipy.ndarray / scipy.errstate are deprecated aliases of the NumPy
    # objects; use NumPy directly (it is always installed alongside SciPy).
    import numpy as np
    if not (isinstance(ty, np.ndarray) and isinstance(pv, np.ndarray)):
        raise TypeError("type of ty and pv must be ndarray")
    if len(ty) != len(pv):
        raise ValueError("len(ty) must be equal to len(pv)")
    ACC = 100.0*(ty == pv).mean()
    MSE = ((ty - pv)**2).mean()
    l = len(ty)
    sumv = pv.sum()
    sumy = ty.sum()
    sumvy = (pv*ty).sum()
    sumvv = (pv*pv).sum()
    sumyy = (ty*ty).sum()
    # Turn floating-point warnings into exceptions so a degenerate
    # denominator is reported as NaN instead of silently producing inf.
    with np.errstate(all='raise'):
        try:
            SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
        except ArithmeticError:
            # Covers FloatingPointError, ZeroDivisionError and OverflowError.
            SCC = float('nan')
    return (float(ACC), float(MSE), float(SCC))
+
def evaluations(ty, pv, useScipy = True):
    """
    evaluations(ty, pv, useScipy) -> (ACC, MSE, SCC)
    ty, pv: list, tuple or ndarray
    useScipy: convert ty, pv to ndarray, and use scipy functions for the evaluation

    Calculate accuracy, mean squared error and squared correlation coefficient
    using the true values (ty) and predicted values (pv).

    Raises ValueError when ty and pv differ in length. For empty input the
    pure-Python path returns (nan, nan, nan). SCC is NaN when its
    denominator is zero.
    """
    if scipy is not None and useScipy:
        # scipy.asarray is a deprecated alias of numpy.asarray.
        import numpy as np
        return evaluations_scipy(np.asarray(ty), np.asarray(pv))
    if len(ty) != len(pv):
        raise ValueError("len(ty) must be equal to len(pv)")
    l = len(ty)
    if l == 0:
        # Nothing to average over; report NaN instead of dividing by zero.
        nan = float('nan')
        return (nan, nan, nan)
    total_correct = total_error = 0
    sumv = sumy = sumvv = sumyy = sumvy = 0
    for v, y in zip(pv, ty):
        if y == v:
            total_correct += 1
        total_error += (v-y)*(v-y)
        sumv += v
        sumy += y
        sumvv += v*v
        sumyy += y*y
        sumvy += v*y
    ACC = 100.0*total_correct/l
    # float() keeps the divisions exact on Python 2, where int/int truncates.
    MSE = float(total_error)/l
    try:
        SCC = float((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
    except ArithmeticError:
        # Zero denominator (a constant ty or pv) or an overflow.
        SCC = float('nan')
    return (float(ACC), float(MSE), float(SCC))
+
def csr_find_scale_param(x, lower=-1, upper=1):
    """
    csr_find_scale_param(x, lower=-1, upper=1) -> {'coef': ndarray, 'offset': ndarray}
    x: csr_matrix
    lower, upper: target range of every feature (lower < upper)

    Compute per-feature scaling parameters such that
    x * diag(coef) + ones(l, 1) * offset' maps the minimum of each column
    to lower and its maximum to upper. Columns holding a single value get
    coef = 0 and offset = 0.

    A warning is written to stderr when the offsets would make the scaled
    data much denser than x.
    """
    assert isinstance(x, sparse.csr_matrix), "x must be a scipy csr_matrix"
    assert lower < upper, "lower must be smaller than upper"
    l, n = x.shape
    feat_min = x.min(axis=0).toarray().flatten()
    feat_max = x.max(axis=0).toarray().flatten()
    # float() forces true division on Python 2 even for integer data.
    coef = (feat_max - feat_min) / float(upper - lower)
    coef[coef != 0] = 1.0 / coef[coef != 0]

    # (x - ones(l,1) * feat_min') * diag(coef) + lower
    # = x * diag(coef) - ones(l, 1) * (feat_min' * diag(coef)) + lower
    # = x * diag(coef) + ones(l, 1) * (-feat_min' * diag(coef) + lower)
    # = x * diag(coef) + ones(l, 1) * offset'
    offset = -feat_min * coef + lower
    offset[coef == 0] = 0

    # A non-zero offset makes its whole column dense after scaling.
    dense_columns = int((offset != 0).sum())
    if dense_columns * l > 3 * x.getnnz():
        print(
            "WARNING: The #nonzeros of the scaled data is at least 2 times larger than the original one.\n"
            "If feature values are non-negative and sparse, set lower=0 rather than the default lower=-1.",
            file=sys.stderr)

    return {'coef':coef, 'offset':offset}
+
def _resize_with_zeros(values, n):
    """Return a length-n copy of values, truncated or padded with zeros."""
    import numpy as np
    values = np.asarray(values)
    resized = np.zeros(n, dtype=values.dtype)
    kept = min(n, len(values))
    resized[:kept] = values[:kept]
    return resized


def csr_scale(x, scale_param):
    """
    csr_scale(x, scale_param) -> csr_matrix
    x: csr_matrix
    scale_param: dictionary with 'coef' and 'offset' arrays, as returned by
                 csr_find_scale_param

    Return x * diag(coef) + ones(l, 1) * offset'.

    When the number of features in x differs from the length of the
    scaling parameters, a warning is written to stderr and the parameters
    are truncated or padded with zeros, so features without parameters
    are scaled to 0. A second warning is written when scaling increases
    the number of non-zero entries.
    """
    import numpy as np

    assert isinstance(x, sparse.csr_matrix)

    offset = np.asarray(scale_param['offset'])
    coef = np.asarray(scale_param['coef'])
    assert len(coef) == len(offset)

    l, n = x.shape

    if n != len(coef):
        print("WARNING: The dimension of scaling parameters and feature number do not match.", file=sys.stderr)
        coef = _resize_with_zeros(coef, n)
        offset = _resize_with_zeros(offset, n)

    # scaled_x = x * diag(coef) + ones(l, 1) * offset'
    offset = sparse.csr_matrix(offset.reshape(1, n))
    # Keep the offsets in their own dtype: casting them to x.dtype would
    # truncate fractional offsets when x holds integers.
    offset = sparse.vstack([offset] * l, format='csr')
    scaled_x = x.dot(sparse.diags(coef, 0, shape=(n, n))) + offset

    if scaled_x.getnnz() > x.getnnz():
        print(
            "WARNING: original #nonzeros %d\n" % x.getnnz() +
            " > new #nonzeros %d\n" % scaled_x.getnnz() +
            "If feature values are non-negative and sparse, get scale_param by setting lower=0 rather than the default lower=-1.",
            file=sys.stderr)

    return scaled_x
from liblinear import *
from liblinear import __all__ as liblinear_all
from liblinear import scipy, sparse
+from commonutil import *
+from commonutil import __all__ as common_all
from ctypes import c_double
if sys.version_info[0] < 3:
range = xrange
from itertools import izip as zip
-__all__ = ['svm_read_problem', 'load_model', 'save_model', 'evaluations',
- 'train', 'predict'] + liblinear_all
+__all__ = ['load_model', 'save_model', 'train', 'predict'] + liblinear_all + common_all
-def svm_read_problem(data_file_name, return_scipy=False):
- """
- svm_read_problem(data_file_name, return_scipy=False) -> [y, x], y: list, x: list of dictionary
- svm_read_problem(data_file_name, return_scipy=True) -> [y, x], y: ndarray, x: csr_matrix
-
- Read LIBSVM-format data from data_file_name and return labels y
- and data instances x.
- """
- prob_y = []
- prob_x = []
- row_ptr = [0]
- col_idx = []
- for i, line in enumerate(open(data_file_name)):
- line = line.split(None, 1)
- # In case an instance with all zero features
- if len(line) == 1: line += ['']
- label, features = line
- prob_y += [float(label)]
- if scipy != None and return_scipy:
- nz = 0
- for e in features.split():
- ind, val = e.split(":")
- val = float(val)
- if val != 0:
- col_idx += [int(ind)-1]
- prob_x += [val]
- nz += 1
- row_ptr += [row_ptr[-1]+nz]
- else:
- xi = {}
- for e in features.split():
- ind, val = e.split(":")
- xi[int(ind)] = float(val)
- prob_x += [xi]
- if scipy != None and return_scipy:
- prob_y = scipy.array(prob_y)
- prob_x = scipy.array(prob_x)
- col_idx = scipy.array(col_idx)
- row_ptr = scipy.array(row_ptr)
- prob_x = sparse.csr_matrix((prob_x, col_idx, row_ptr))
- return (prob_y, prob_x)
-
def load_model(model_file_name):
"""
load_model(model_file_name) -> model
"""
liblinear.save_model(model_file_name.encode(), model)
-def evaluations_scipy(ty, pv):
- """
- evaluations_scipy(ty, pv) -> (ACC, MSE, SCC)
- ty, pv: ndarray
-
- Calculate accuracy, mean squared error and squared correlation coefficient
- using the true values (ty) and predicted values (pv).
- """
- if not (scipy != None and isinstance(ty, scipy.ndarray) and isinstance(pv, scipy.ndarray)):
- raise TypeError("type of ty and pv must be ndarray")
- if len(ty) != len(pv):
- raise ValueError("len(ty) must be equal to len(pv)")
- ACC = 100.0*(ty == pv).mean()
- MSE = ((ty - pv)**2).mean()
- l = len(ty)
- sumv = pv.sum()
- sumy = ty.sum()
- sumvy = (pv*ty).sum()
- sumvv = (pv*pv).sum()
- sumyy = (ty*ty).sum()
- with scipy.errstate(all = 'raise'):
- try:
- SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
- except:
- SCC = float('nan')
- return (float(ACC), float(MSE), float(SCC))
-
-def evaluations(ty, pv, useScipy = True):
- """
- evaluations(ty, pv, useScipy) -> (ACC, MSE, SCC)
- ty, pv: list, tuple or ndarray
- useScipy: convert ty, pv to ndarray, and use scipy functions for the evaluation
-
- Calculate accuracy, mean squared error and squared correlation coefficient
- using the true values (ty) and predicted values (pv).
- """
- if scipy != None and useScipy:
- return evaluations_scipy(scipy.asarray(ty), scipy.asarray(pv))
- if len(ty) != len(pv):
- raise ValueError("len(ty) must be equal to len(pv)")
- total_correct = total_error = 0
- sumv = sumy = sumvv = sumyy = sumvy = 0
- for v, y in zip(pv, ty):
- if y == v:
- total_correct += 1
- total_error += (v-y)*(v-y)
- sumv += v
- sumy += y
- sumvv += v*v
- sumyy += y*y
- sumvy += v*y
- l = len(ty)
- ACC = 100.0*total_correct/l
- MSE = total_error/l
- try:
- SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
- except:
- SCC = float('nan')
- return (float(ACC), float(MSE), float(SCC))
-
def train(arg1, arg2=None, arg3=None):
"""
train(y, x [, options]) -> model | ACC
--- /dev/null
+#include <float.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+
/* Print the command-line usage summary to stdout and terminate with status 1. */
void exit_with_help()
{
	static const char usage[] =
		"Usage: svm-scale [options] data_filename\n"
		"options:\n"
		"-l lower : x scaling lower limit (default -1)\n"
		"-u upper : x scaling upper limit (default +1)\n"
		"-y y_lower y_upper : y scaling limits (default: no y scaling)\n"
		"-s save_filename : save scaling parameters to save_filename\n"
		"-r restore_filename : restore scaling parameters from restore_filename\n";

	fputs(usage, stdout);
	exit(1);
}
+
/* Growable buffer holding the line most recently read by readline(). */
char *line = NULL;
int max_line_len = 1024;

/* Target range for features (-l / -u) and for labels (-y). */
double lower = -1.0;
double upper = 1.0;
double y_lower;
double y_upper;
int y_scaling = 0;	/* non-zero once label scaling has been requested */

/* Per-feature extrema, indexed by feature index. */
double *feature_max;
double *feature_min;

/* Label extrema, seeded so that any real label replaces them. */
double y_max = -DBL_MAX;
double y_min = DBL_MAX;

/* Largest and smallest feature index encountered. */
int max_index;
int min_index;

/* Element counts before and after scaling, used for the density warning. */
long int num_nonzeros = 0;
long int new_num_nonzeros = 0;

#define max(x,y) (((x)>(y))?(x):(y))
#define min(x,y) (((x)<(y))?(x):(y))

void output_target(double value);
void output(int index, double value);
char* readline(FILE *input);
int clean_up(FILE *fp_restore, FILE *fp, const char *msg);
+
+int main(int argc,char **argv)
+{
+ int i,index;
+ FILE *fp, *fp_restore = NULL;
+ char *save_filename = NULL;
+ char *restore_filename = NULL;
+
+ for(i=1;i<argc;i++)
+ {
+ if(argv[i][0] != '-') break;
+ ++i;
+ switch(argv[i-1][1])
+ {
+ case 'l': lower = atof(argv[i]); break;
+ case 'u': upper = atof(argv[i]); break;
+ case 'y':
+ y_lower = atof(argv[i]);
+ ++i;
+ y_upper = atof(argv[i]);
+ y_scaling = 1;
+ break;
+ case 's': save_filename = argv[i]; break;
+ case 'r': restore_filename = argv[i]; break;
+ default:
+ fprintf(stderr,"unknown option\n");
+ exit_with_help();
+ }
+ }
+
+ if(!(upper > lower) || (y_scaling && !(y_upper > y_lower)))
+ {
+ fprintf(stderr,"inconsistent lower/upper specification\n");
+ exit(1);
+ }
+
+ if(restore_filename && save_filename)
+ {
+ fprintf(stderr,"cannot use -r and -s simultaneously\n");
+ exit(1);
+ }
+
+ if(argc != i+1)
+ exit_with_help();
+
+ fp=fopen(argv[i],"r");
+
+ if(fp==NULL)
+ {
+ fprintf(stderr,"can't open file %s\n", argv[i]);
+ exit(1);
+ }
+
+ line = (char *) malloc(max_line_len*sizeof(char));
+
+#define SKIP_TARGET\
+ while(isspace(*p)) ++p;\
+ while(!isspace(*p)) ++p;
+
+#define SKIP_ELEMENT\
+ while(*p!=':') ++p;\
+ ++p;\
+ while(isspace(*p)) ++p;\
+ while(*p && !isspace(*p)) ++p;
+
+ /* assumption: min index of attributes is 1 */
+ /* pass 1: find out max index of attributes */
+ max_index = 0;
+ min_index = 1;
+
+ if(restore_filename)
+ {
+ int idx, c;
+
+ fp_restore = fopen(restore_filename,"r");
+ if(fp_restore==NULL)
+ {
+ fprintf(stderr,"can't open file %s\n", restore_filename);
+ exit(1);
+ }
+
+ c = fgetc(fp_restore);
+ if(c == 'y')
+ {
+ readline(fp_restore);
+ readline(fp_restore);
+ readline(fp_restore);
+ }
+ readline(fp_restore);
+ readline(fp_restore);
+
+ while(fscanf(fp_restore,"%d %*f %*f\n",&idx) == 1)
+ max_index = max(idx,max_index);
+ rewind(fp_restore);
+ }
+
+ while(readline(fp)!=NULL)
+ {
+ char *p=line;
+
+ SKIP_TARGET
+
+ while(sscanf(p,"%d:%*f",&index)==1)
+ {
+ max_index = max(max_index, index);
+ min_index = min(min_index, index);
+ SKIP_ELEMENT
+ num_nonzeros++;
+ }
+ }
+
+ if(min_index < 1)
+ fprintf(stderr,
+ "WARNING: minimal feature index is %d, but indices should start from 1\n", min_index);
+
+ rewind(fp);
+
+ feature_max = (double *)malloc((max_index+1)* sizeof(double));
+ feature_min = (double *)malloc((max_index+1)* sizeof(double));
+
+ if(feature_max == NULL || feature_min == NULL)
+ {
+ fprintf(stderr,"can't allocate enough memory\n");
+ exit(1);
+ }
+
+ for(i=0;i<=max_index;i++)
+ {
+ feature_max[i]=-DBL_MAX;
+ feature_min[i]=DBL_MAX;
+ }
+
+ /* pass 2: find out min/max value */
+ while(readline(fp)!=NULL)
+ {
+ char *p=line;
+ int next_index=1;
+ double target;
+ double value;
+
+ if (sscanf(p,"%lf",&target) != 1)
+ return clean_up(fp_restore, fp, "ERROR: failed to read labels\n");
+ y_max = max(y_max,target);
+ y_min = min(y_min,target);
+
+ SKIP_TARGET
+
+ while(sscanf(p,"%d:%lf",&index,&value)==2)
+ {
+ for(i=next_index;i<index;i++)
+ {
+ feature_max[i]=max(feature_max[i],0);
+ feature_min[i]=min(feature_min[i],0);
+ }
+
+ feature_max[index]=max(feature_max[index],value);
+ feature_min[index]=min(feature_min[index],value);
+
+ SKIP_ELEMENT
+ next_index=index+1;
+ }
+
+ for(i=next_index;i<=max_index;i++)
+ {
+ feature_max[i]=max(feature_max[i],0);
+ feature_min[i]=min(feature_min[i],0);
+ }
+ }
+
+ rewind(fp);
+
+ /* pass 2.5: save/restore feature_min/feature_max */
+
+ if(restore_filename)
+ {
+ /* fp_restore rewinded in finding max_index */
+ int idx, c;
+ double fmin, fmax;
+ int next_index = 1;
+
+ if((c = fgetc(fp_restore)) == 'y')
+ {
+ if(fscanf(fp_restore, "%lf %lf\n", &y_lower, &y_upper) != 2 ||
+ fscanf(fp_restore, "%lf %lf\n", &y_min, &y_max) != 2)
+ return clean_up(fp_restore, fp, "ERROR: failed to read scaling parameters\n");
+ y_scaling = 1;
+ }
+ else
+ ungetc(c, fp_restore);
+
+ if (fgetc(fp_restore) == 'x')
+ {
+ if(fscanf(fp_restore, "%lf %lf\n", &lower, &upper) != 2)
+ return clean_up(fp_restore, fp, "ERROR: failed to read scaling parameters\n");
+ while(fscanf(fp_restore,"%d %lf %lf\n",&idx,&fmin,&fmax)==3)
+ {
+ for(i = next_index;i<idx;i++)
+ if(feature_min[i] != feature_max[i])
+ {
+ fprintf(stderr,
+ "WARNING: feature index %d appeared in file %s was not seen in the scaling factor file %s. The feature is scaled to 0.\n",
+ i, argv[argc-1], restore_filename);
+ feature_min[i] = 0;
+ feature_max[i] = 0;
+ }
+
+ feature_min[idx] = fmin;
+ feature_max[idx] = fmax;
+
+ next_index = idx + 1;
+ }
+
+ for(i=next_index;i<=max_index;i++)
+ if(feature_min[i] != feature_max[i])
+ {
+ fprintf(stderr,
+ "WARNING: feature index %d appeared in file %s was not seen in the scaling factor file %s. The feature is scaled to 0.\n",
+ i, argv[argc-1], restore_filename);
+ feature_min[i] = 0;
+ feature_max[i] = 0;
+ }
+ }
+ fclose(fp_restore);
+ }
+
+ if(save_filename)
+ {
+ FILE *fp_save = fopen(save_filename,"w");
+ if(fp_save==NULL)
+ {
+ fprintf(stderr,"can't open file %s\n", save_filename);
+ exit(1);
+ }
+ if(y_scaling)
+ {
+ fprintf(fp_save, "y\n");
+ fprintf(fp_save, "%.17g %.17g\n", y_lower, y_upper);
+ fprintf(fp_save, "%.17g %.17g\n", y_min, y_max);
+ }
+ fprintf(fp_save, "x\n");
+ fprintf(fp_save, "%.17g %.17g\n", lower, upper);
+ for(i=1;i<=max_index;i++)
+ {
+ if(feature_min[i]!=feature_max[i])
+ fprintf(fp_save,"%d %.17g %.17g\n",i,feature_min[i],feature_max[i]);
+ }
+
+ if(min_index < 1)
+ fprintf(stderr,
+ "WARNING: scaling factors with indices smaller than 1 are not stored to the file %s.\n", save_filename);
+
+ fclose(fp_save);
+ }
+
+ /* pass 3: scale */
+ while(readline(fp)!=NULL)
+ {
+ char *p=line;
+ int next_index=1;
+ double target;
+ double value;
+
+ if (sscanf(p,"%lf",&target) != 1)
+ return clean_up(NULL, fp, "ERROR: failed to read labels\n");
+ output_target(target);
+
+ SKIP_TARGET
+
+ while(sscanf(p,"%d:%lf",&index,&value)==2)
+ {
+ for(i=next_index;i<index;i++)
+ output(i,0);
+
+ output(index,value);
+
+ SKIP_ELEMENT
+ next_index=index+1;
+ }
+
+ for(i=next_index;i<=max_index;i++)
+ output(i,0);
+
+ printf("\n");
+ }
+
+ if (new_num_nonzeros > num_nonzeros)
+ fprintf(stderr,
+ "WARNING: original #nonzeros %ld\n"
+ " > new #nonzeros %ld\n"
+ "If feature values are non-negative and sparse, use -l 0 rather than the default -l -1\n",
+ num_nonzeros, new_num_nonzeros);
+
+ free(line);
+ free(feature_max);
+ free(feature_min);
+ fclose(fp);
+ return 0;
+}
+
+char* readline(FILE *input)
+{
+ int len;
+
+ if(fgets(line,max_line_len,input) == NULL)
+ return NULL;
+
+ while(strrchr(line,'\n') == NULL)
+ {
+ max_line_len *= 2;
+ line = (char *) realloc(line, max_line_len);
+ len = (int) strlen(line);
+ if(fgets(line+len,max_line_len-len,input) == NULL)
+ break;
+ }
+ return line;
+}
+
+void output_target(double value)
+{
+ if(y_scaling)
+ {
+ if(value == y_min)
+ value = y_lower;
+ else if(value == y_max)
+ value = y_upper;
+ else value = y_lower + (y_upper-y_lower) *
+ (value - y_min)/(y_max-y_min);
+ }
+ printf("%.17g ",value);
+}
+
+void output(int index, double value)
+{
+ /* skip single-valued attribute */
+ if(feature_max[index] == feature_min[index])
+ return;
+
+ if(value == feature_min[index])
+ value = lower;
+ else if(value == feature_max[index])
+ value = upper;
+ else
+ value = lower + (upper-lower) *
+ (value-feature_min[index])/
+ (feature_max[index]-feature_min[index]);
+
+ if(value != 0)
+ {
+ printf("%d:%g ",index, value);
+ new_num_nonzeros++;
+ }
+}
+
+int clean_up(FILE *fp_restore, FILE *fp, const char* msg)
+{
+ fprintf(stderr, "%s", msg);
+ free(line);
+ free(feature_max);
+ free(feature_min);
+ fclose(fp);
+ if (fp_restore)
+ fclose(fp_restore);
+ return -1;
+}
+