From: Chih-Jen Lin
Date: Thu, 26 Jul 2018 15:11:23 +0000 (+0800)
Subject: Include LIBSVM's svm-scale.c in the package.
X-Git-Tag: v221~7
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=44012bc10f8dc5fc9dbf75b93b4fa3e0e3c29200;p=liblinear

Include LIBSVM's svm-scale.c in the package.

Add two functions (csr_find_scale_param, csr_scale) for data scaling in
the Python interface. Remove the utility functions (svm_read_problem,
evaluations) from liblinearutil.py and use them directly from LIBSVM's
commonutil.py. Functions for scaling are also put in commonutil.py.
---

diff --git a/README b/README
index 53a97e4..287edb1 100644
--- a/README
+++ b/README
@@ -17,6 +17,7 @@ Table of Contents
 - Installation
 - `train' Usage
 - `predict' Usage
+- `svm-scale' Usage
 - Examples
 - Library Usage
 - Building Windows Binaries
@@ -75,8 +76,8 @@ sparse data, use `-l 0' to keep the sparsity.
 Installation
 ============
 
-On Unix systems, type `make' to build the `train' and `predict'
-programs. Run them without arguments to show the usages.
+On Unix systems, type `make' to build the `train', `predict',
+and `svm-scale' programs. Run them without arguments to show the usages.
 
 On other systems, consult `Makefile' to build them (e.g., see
 'Building Windows binaries' in this file) or use the pre-built
@@ -92,6 +93,8 @@ and mark
 
 	LIBS ?= blas/blas.a
 
+The tool `svm-scale', borrowed from LIBSVM, is for scaling input data files.
+
 `train' Usage
 =============
 
@@ -233,6 +236,11 @@ options:
 Note that -b is only needed in the prediction phase. This is different
 from the setting of LIBSVM.
 
+`svm-scale' Usage
+=================
+
+See LIBSVM README.
+
 Examples
 ========
 
@@ -437,7 +445,7 @@ in linear.h, so you can check the version number.
 
 - Function: void find_parameter_C(const struct problem *prob,
             const struct parameter *param, int nr_fold, double start_C,
-            double max_C, double *best_C, double *best_rate);
+	    double max_C, double *best_C, double *best_rate);
 
   This function is similar to cross_validation. However, instead of
   conducting cross validation under a specified parameter C, it
@@ -581,8 +589,8 @@ nmake -f Makefile.win clean all
 nmake -f Makefile.win lib
 
 4. (Optional) To build 32-bit windows binaries, you must
-   (1) Setup "C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin\vcvars32.bat" instead of vcvars64.bat
-   (2) Change CFLAGS in Makefile.win: /D _WIN64 to /D _WIN32
+	(1) Setup "C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin\vcvars32.bat" instead of vcvars64.bat
+	(2) Change CFLAGS in Makefile.win: /D _WIN64 to /D _WIN32
 
 MATLAB/OCTAVE Interface
 =======================
diff --git a/python/README b/python/README
index 351e70f..ccc3531 100644
--- a/python/README
+++ b/python/README
@@ -108,6 +108,11 @@ in liblinearutil.py and the usage is the same as the LIBLINEAR MATLAB interface.
 >>> param = parameter('-s 0 -c 4 -B 1')
 >>> m = train(prob, param)
 
+# Apply data scaling in Scipy format
+>>> y, x = svm_read_problem('../heart_scale', return_scipy=True)
+>>> scale_param = csr_find_scale_param(x, lower=0)
+>>> scaled_x = csr_scale(x, scale_param)
+
 # Other utility functions
 >>> save_model('heart_scale.model', m)
 >>> m = load_model('heart_scale.model')
@@ -193,10 +198,10 @@ LIBLINEAR shared library:
 
 	y: a Python list/tuple/ndarray of l labels (type must be int/double).
 
-	x: 1. a list/tuple of l training instances. Feature vector of
-	   each training instance is a list/tuple or dictionary.
+	x: 1. a list/tuple of l training instances. Feature vector of
+	   each training instance is a list/tuple or dictionary.
 
-	   2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
+	   2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
 
 	bias: if bias >= 0, instance x becomes [x; bias]; if < 0, no bias term
 	      added (default -1)
 
@@ -310,10 +315,10 @@ The above command loads
 
 	y: a list/tuple/ndarray of l training labels (type must be int/double).
 
-	x: 1. a list/tuple of l training instances. Feature vector of
-	   each training instance is a list/tuple or dictionary.
+	x: 1. a list/tuple of l training instances. Feature vector of
+	   each training instance is a list/tuple or dictionary.
 
-	   2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
+	   2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
 
 	training_options: a string in the same form as that for LIBLINEAR
 			  command mode.
@@ -356,13 +361,13 @@ The above command loads
 >>> p_labs, p_acc, p_vals = predict(y, x, model [,'predicting_options'])
 
 	y: a list/tuple/ndarray of l true labels (type must be int/double).
-	   It is used for calculating the accuracy. Use [] if true labels are
+	   It is used for calculating the accuracy. Use [] if true labels are
 	   unavailable.
 
-	x: 1. a list/tuple of l training instances. Feature vector of
-	   each training instance is a list/tuple or dictionary.
+	x: 1. a list/tuple of l training instances. Feature vector of
+	   each training instance is a list/tuple or dictionary.
 
-	   2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
+	   2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
 
 	predicting_options: a string of predicting options in the same format
 			    as that of LIBLINEAR.
@@ -399,7 +404,7 @@ The above command loads
 
 - Function: evaluations
 
-	Calculate some evaluations using the true values (ty) and predicted
+	Calculate some evaluations using the true values (ty) and the predicted
 	values (pv):
 
 	>>> (ACC, MSE, SCC) = evaluations(ty, pv, useScipy)
@@ -416,6 +421,26 @@ The above command loads
 
 	SCC: squared correlation coefficient.
 
+- Function: csr_find_scale_param/csr_scale
+
+	Scale data in csr format.
+
+	>>> param = csr_find_scale_param(x [, lower=l, upper=u])
+	>>> x = csr_scale(x, param)
+
+	x: a csr_matrix of data.
+
+	l: x scaling lower limit; default -1.
+
+	u: x scaling upper limit; default 1.
+
+	The scaling process is: x * diag(coef) + ones(l, 1) * offset'
+
+	param: a dictionary of scaling parameters, where param['coef'] = coef and param['offset'] = offset.
+
+	coef: a scipy array of scaling coefficients.
+
+	offset: a scipy array of scaling offsets.
 
 Additional Information
 ======================
diff --git a/python/commonutil.py b/python/commonutil.py
new file mode 100644
index 0000000..f2da8de
--- /dev/null
+++ b/python/commonutil.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+import sys
+
+try:
+	import scipy
+	from scipy import sparse
+except:
+	scipy = None
+	sparse = None
+
+
+__all__ = ['svm_read_problem', 'evaluations', 'csr_find_scale_param', 'csr_scale']
+
+def svm_read_problem(data_file_name, return_scipy=False):
+	"""
+	svm_read_problem(data_file_name, return_scipy=False) -> [y, x], y: list, x: list of dictionary
+	svm_read_problem(data_file_name, return_scipy=True) -> [y, x], y: ndarray, x: csr_matrix
+
+	Read LIBSVM-format data from data_file_name and return labels y
+	and data instances x.
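+
+	When return_scipy is True, the instances are assembled into a
+	scipy.sparse.csr_matrix from (data, indices, indptr) arrays, and
+	explicit zeros in the file are skipped so the matrix stays sparse.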
+	"""
+	prob_y = []
+	prob_x = []
+	row_ptr = [0]
+	col_idx = []
+	for i, line in enumerate(open(data_file_name)):
+		line = line.split(None, 1)
+		# In case an instance with all zero features
+		if len(line) == 1: line += ['']
+		label, features = line
+		prob_y += [float(label)]
+		if scipy != None and return_scipy:
+			nz = 0
+			for e in features.split():
+				ind, val = e.split(":")
+				val = float(val)
+				if val != 0:
+					col_idx += [int(ind)-1]
+					prob_x += [val]
+					nz += 1
+			row_ptr += [row_ptr[-1]+nz]
+		else:
+			xi = {}
+			for e in features.split():
+				ind, val = e.split(":")
+				xi[int(ind)] = float(val)
+			prob_x += [xi]
+	if scipy != None and return_scipy:
+		prob_y = scipy.array(prob_y)
+		prob_x = scipy.array(prob_x)
+		col_idx = scipy.array(col_idx)
+		row_ptr = scipy.array(row_ptr)
+		prob_x = sparse.csr_matrix((prob_x, col_idx, row_ptr))
+	return (prob_y, prob_x)
+
+def evaluations_scipy(ty, pv):
+	"""
+	evaluations_scipy(ty, pv) -> (ACC, MSE, SCC)
+	ty, pv: ndarray
+
+	Calculate accuracy, mean squared error and squared correlation coefficient
+	using the true values (ty) and predicted values (pv).
+	"""
+	if not (scipy != None and isinstance(ty, scipy.ndarray) and isinstance(pv, scipy.ndarray)):
+		raise TypeError("type of ty and pv must be ndarray")
+	if len(ty) != len(pv):
+		raise ValueError("len(ty) must be equal to len(pv)")
+	ACC = 100.0*(ty == pv).mean()
+	MSE = ((ty - pv)**2).mean()
+	l = len(ty)
+	sumv = pv.sum()
+	sumy = ty.sum()
+	sumvy = (pv*ty).sum()
+	sumvv = (pv*pv).sum()
+	sumyy = (ty*ty).sum()
+	with scipy.errstate(all = 'raise'):
+		try:
+			SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
+		except:
+			SCC = float('nan')
+	return (float(ACC), float(MSE), float(SCC))
+
+def evaluations(ty, pv, useScipy = True):
+	"""
+	evaluations(ty, pv, useScipy) -> (ACC, MSE, SCC)
+	ty, pv: list, tuple or ndarray
+	useScipy: convert ty, pv to ndarray, and use scipy functions for the evaluation
+
+	Calculate accuracy, mean squared error and squared correlation coefficient
+	using the true values (ty) and predicted values (pv).
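+
+	SCC is the squared Pearson correlation coefficient between ty and pv;
+	it is NaN when either sequence has zero variance.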
+	"""
+	if scipy != None and useScipy:
+		return evaluations_scipy(scipy.asarray(ty), scipy.asarray(pv))
+	if len(ty) != len(pv):
+		raise ValueError("len(ty) must be equal to len(pv)")
+	total_correct = total_error = 0
+	sumv = sumy = sumvv = sumyy = sumvy = 0
+	for v, y in zip(pv, ty):
+		if y == v:
+			total_correct += 1
+		total_error += (v-y)*(v-y)
+		sumv += v
+		sumy += y
+		sumvv += v*v
+		sumyy += y*y
+		sumvy += v*y
+	l = len(ty)
+	ACC = 100.0*total_correct/l
+	MSE = total_error/l
+	try:
+		SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
+	except:
+		SCC = float('nan')
+	return (float(ACC), float(MSE), float(SCC))
+
+def csr_find_scale_param(x, lower=-1, upper=1):
+	assert isinstance(x, sparse.csr_matrix)
+	assert lower < upper
+	l, n = x.shape
+	feat_min = x.min(axis=0).toarray().flatten()
+	feat_max = x.max(axis=0).toarray().flatten()
+	coef = (feat_max - feat_min) / (upper - lower)
+	coef[coef != 0] = 1.0 / coef[coef != 0]
+
+	# (x - ones(l,1) * feat_min') * diag(coef) + lower
+	# = x * diag(coef) - ones(l, 1) * (feat_min' * diag(coef)) + lower
+	# = x * diag(coef) + ones(l, 1) * (-feat_min' * diag(coef) + lower)
+	# = x * diag(coef) + ones(l, 1) * offset'
+	offset = -feat_min * coef + lower
+	offset[coef == 0] = 0
+
+	if sum(offset != 0) * l > 3 * x.getnnz():
+		print(
+			"WARNING: The #nonzeros of the scaled data is at least 2 times larger than the original one.\n"
+			"If feature values are non-negative and sparse, set lower=0 rather than the default lower=-1.",
+			file=sys.stderr)
+
+	return {'coef':coef, 'offset':offset}
+
+def csr_scale(x, scale_param):
+	assert isinstance(x, sparse.csr_matrix)
+
+	offset = scale_param['offset']
+	coef = scale_param['coef']
+	assert len(coef) == len(offset)
+
+	l, n = x.shape
+
+	if not n == len(coef):
+		print("WARNING: The dimension of scaling parameters and feature number do not match.", file=sys.stderr)
+		coef = scipy.resize(coef, n)
+		offset = scipy.resize(offset, n)
+
+	# scaled_x = x * diag(coef) + ones(l, 1) * offset'
+	offset = sparse.csr_matrix(offset.reshape(1, n))
+	offset = sparse.vstack([offset] * l, format='csr', dtype=x.dtype)
+	scaled_x = x.dot(sparse.diags(coef, 0, shape=(n, n))) + offset
+
+	if scaled_x.getnnz() > x.getnnz():
+		print(
+			"WARNING: original #nonzeros %d\n" % x.getnnz() +
+			"       < new      #nonzeros %d\n" % scaled_x.getnnz() +
+			"If feature values are non-negative and sparse, get scale_param by setting lower=0 rather than the default lower=-1.",
+			file=sys.stderr)
diff --git a/python/liblinearutil.py b/python/liblinearutil.py
index d3f3015..9a07f59 100644
--- a/python/liblinearutil.py
+++ b/python/liblinearutil.py
@@ -5,58 +5,17 @@ sys.path = [os.path.dirname(os.path.abspath(__file__))] + sys.path
 from liblinear import *
 from liblinear import __all__ as liblinear_all
 from liblinear import scipy, sparse
+from commonutil import *
+from commonutil import __all__ as common_all
 from ctypes import c_double
 
 if sys.version_info[0] < 3:
 	range = xrange
 	from itertools import izip as zip
 
-__all__ = ['svm_read_problem', 'load_model', 'save_model', 'evaluations',
-           'train', 'predict'] + liblinear_all
+__all__ = ['load_model', 'save_model', 'train', 'predict'] + liblinear_all + common_all
 
-def svm_read_problem(data_file_name, return_scipy=False):
-	"""
-	svm_read_problem(data_file_name, return_scipy=False) -> [y, x], y: list, x: list of dictionary
-	svm_read_problem(data_file_name, return_scipy=True) -> [y, x], y: ndarray, x: csr_matrix
-
-	Read LIBSVM-format data from data_file_name and return labels y
-	and data instances x.
-	"""
-	prob_y = []
-	prob_x = []
-	row_ptr = [0]
-	col_idx = []
-	for i, line in enumerate(open(data_file_name)):
-		line = line.split(None, 1)
-		# In case an instance with all zero features
-		if len(line) == 1: line += ['']
-		label, features = line
-		prob_y += [float(label)]
-		if scipy != None and return_scipy:
-			nz = 0
-			for e in features.split():
-				ind, val = e.split(":")
-				val = float(val)
-				if val != 0:
-					col_idx += [int(ind)-1]
-					prob_x += [val]
-					nz += 1
-			row_ptr += [row_ptr[-1]+nz]
-		else:
-			xi = {}
-			for e in features.split():
-				ind, val = e.split(":")
-				xi[int(ind)] = float(val)
-			prob_x += [xi]
-	if scipy != None and return_scipy:
-		prob_y = scipy.array(prob_y)
-		prob_x = scipy.array(prob_x)
-		col_idx = scipy.array(col_idx)
-		row_ptr = scipy.array(row_ptr)
-		prob_x = sparse.csr_matrix((prob_x, col_idx, row_ptr))
-	return (prob_y, prob_x)
-
 def load_model(model_file_name):
 	"""
 	load_model(model_file_name) -> model
@@ -78,66 +37,6 @@ def save_model(model_file_name, model):
 	"""
 	liblinear.save_model(model_file_name.encode(), model)
 
-def evaluations_scipy(ty, pv):
-	"""
-	evaluations_scipy(ty, pv) -> (ACC, MSE, SCC)
-	ty, pv: ndarray
-
-	Calculate accuracy, mean squared error and squared correlation coefficient
-	using the true values (ty) and predicted values (pv).
-	"""
-	if not (scipy != None and isinstance(ty, scipy.ndarray) and isinstance(pv, scipy.ndarray)):
-		raise TypeError("type of ty and pv must be ndarray")
-	if len(ty) != len(pv):
-		raise ValueError("len(ty) must be equal to len(pv)")
-	ACC = 100.0*(ty == pv).mean()
-	MSE = ((ty - pv)**2).mean()
-	l = len(ty)
-	sumv = pv.sum()
-	sumy = ty.sum()
-	sumvy = (pv*ty).sum()
-	sumvv = (pv*pv).sum()
-	sumyy = (ty*ty).sum()
-	with scipy.errstate(all = 'raise'):
-		try:
-			SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
-		except:
-			SCC = float('nan')
-	return (float(ACC), float(MSE), float(SCC))
-
-def evaluations(ty, pv, useScipy = True):
-	"""
-	evaluations(ty, pv, useScipy) -> (ACC, MSE, SCC)
-	ty, pv: list, tuple or ndarray
-	useScipy: convert ty, pv to ndarray, and use scipy functions for the evaluation
-
-	Calculate accuracy, mean squared error and squared correlation coefficient
-	using the true values (ty) and predicted values (pv).
-	"""
-	if scipy != None and useScipy:
-		return evaluations_scipy(scipy.asarray(ty), scipy.asarray(pv))
-	if len(ty) != len(pv):
-		raise ValueError("len(ty) must be equal to len(pv)")
-	total_correct = total_error = 0
-	sumv = sumy = sumvv = sumyy = sumvy = 0
-	for v, y in zip(pv, ty):
-		if y == v:
-			total_correct += 1
-		total_error += (v-y)*(v-y)
-		sumv += v
-		sumy += y
-		sumvv += v*v
-		sumyy += y*y
-		sumvy += v*y
-	l = len(ty)
-	ACC = 100.0*total_correct/l
-	MSE = total_error/l
-	try:
-		SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
-	except:
-		SCC = float('nan')
-	return (float(ACC), float(MSE), float(SCC))
-
 def train(arg1, arg2=None, arg3=None):
 	"""
 	train(y, x [, options]) -> model | ACC
diff --git a/svm-scale.c b/svm-scale.c
new file mode 100644
index 0000000..63d1677
--- /dev/null
+++ b/svm-scale.c
@@ -0,0 +1,405 @@
+#include <float.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+
+void exit_with_help()
+{
+	printf(
+	"Usage: svm-scale [options] data_filename\n"
+	"options:\n"
+	"-l lower : x scaling lower limit (default -1)\n"
+	"-u upper : x scaling upper limit (default +1)\n"
+	"-y y_lower y_upper : y scaling limits (default: no y scaling)\n"
+	"-s save_filename : save scaling parameters to save_filename\n"
+	"-r restore_filename : restore scaling parameters from restore_filename\n"
+	);
+	exit(1);
+}
+
+char *line = NULL;
+int max_line_len = 1024;
+double lower=-1.0,upper=1.0,y_lower,y_upper;
+int y_scaling = 0;
+double *feature_max;
+double *feature_min;
+double y_max = -DBL_MAX;
+double y_min = DBL_MAX;
+int max_index;
+int min_index;
+long int num_nonzeros = 0;
+long int new_num_nonzeros = 0;
+
+#define max(x,y) (((x)>(y))?(x):(y))
+#define min(x,y) (((x)<(y))?(x):(y))
+
+void output_target(double value);
+void output(int index, double value);
+char* readline(FILE *input);
+int clean_up(FILE *fp_restore, FILE *fp, const char *msg);
+
+int main(int argc,char **argv)
+{
+	int i,index;
+	FILE *fp, *fp_restore = NULL;
+	char *save_filename = NULL;
+	char *restore_filename = NULL;
+
+	for(i=1;i<argc;i++)
+	{
+		if(argv[i][0] != '-') break;
+		++i;
+		switch(argv[i-1][1])
+		{
+			case 'l': lower = atof(argv[i]); break;
+			case 'u': upper = atof(argv[i]); break;
+			case 'y':
+				y_lower = atof(argv[i]);
+				++i;
+				y_upper = atof(argv[i]);
+				y_scaling = 1;
+				break;
+			case 's': save_filename = argv[i]; break;
+			case 'r': restore_filename = argv[i]; break;
+			default:
+				fprintf(stderr,"unknown option\n");
+				exit_with_help();
+		}
+	}
+
+	if(!(upper > lower) || (y_scaling && !(y_upper > y_lower)))
+	{
+		fprintf(stderr,"inconsistent lower/upper specification\n");
+		exit(1);
+	}
+
+	if(restore_filename && save_filename)
+	{
+		fprintf(stderr,"cannot use -r and -s simultaneously\n");
+		exit(1);
+	}
+
+	if(argc != i+1)
+		exit_with_help();
+
+	fp=fopen(argv[i],"r");
+
+	if(fp==NULL)
+	{
+		fprintf(stderr,"can't open file %s\n", argv[i]);
+		exit(1);
+	}
+
+	line = (char *) malloc(max_line_len*sizeof(char));
+
+#define SKIP_TARGET\
+	while(isspace(*p)) ++p;\
+	while(!isspace(*p)) ++p;
+
+#define SKIP_ELEMENT\
+	while(*p!=':') ++p;\
+	++p;\
+	while(isspace(*p)) ++p;\
+	while(*p && !isspace(*p)) ++p;
+
+	/* assumption: min index of attributes is 1 */
+	/* pass 1: find out max index of attributes */
+	max_index = 0;
+	min_index = 1;
+
+	if(restore_filename)
+	{
+		int idx, c;
+
+		fp_restore = fopen(restore_filename,"r");
+		if(fp_restore==NULL)
+		{
+			fprintf(stderr,"can't open file %s\n", restore_filename);
+			exit(1);
+		}
+
+		c = fgetc(fp_restore);
+		if(c == 'y')
+		{
+			readline(fp_restore);
+			readline(fp_restore);
+			readline(fp_restore);
+		}
+		readline(fp_restore);
+		readline(fp_restore);
+
+		while(fscanf(fp_restore,"%d %*f %*f\n",&idx) == 1)
+			max_index = max(idx,max_index);
+		rewind(fp_restore);
+	}
+
+	while(readline(fp)!=NULL)
+	{
+		char *p=line;
+
+		SKIP_TARGET
+
+		while(sscanf(p,"%d:%*f",&index)==1)
+		{
+			max_index = max(max_index, index);
+			min_index = min(min_index, index);
+			SKIP_ELEMENT
+			num_nonzeros++;
+		}
+	}
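+
+	/* The data file is scanned twice more below: pass 2 records each
+	   feature's min/max (or restores them from -r), and pass 3 rewrites
+	   every instance with the scaled values; fp is rewound between passes. */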
+
+	if(min_index < 1)
+		fprintf(stderr,
+			"WARNING: minimal feature index is %d, but indices should start from 1\n", min_index);
+
+	rewind(fp);
+
+	feature_max = (double *)malloc((max_index+1)* sizeof(double));
+	feature_min = (double *)malloc((max_index+1)* sizeof(double));
+
+	if(feature_max == NULL || feature_min == NULL)
+	{
+		fprintf(stderr,"can't allocate enough memory\n");
+		exit(1);
+	}
+
+	for(i=0;i<=max_index;i++)
+	{
+		feature_max[i]=-DBL_MAX;
+		feature_min[i]=DBL_MAX;
+	}
+
+	/* pass 2: find out min/max value */
+	while(readline(fp)!=NULL)
+	{
+		char *p=line;
+		int next_index=1;
+		double target;
+		double value;
+
+		if (sscanf(p,"%lf",&target) != 1)
+			return clean_up(fp_restore, fp, "ERROR: failed to read labels\n");
+		y_max = max(y_max,target);
+		y_min = min(y_min,target);
+
+		SKIP_TARGET
+
+		while(sscanf(p,"%d:%lf",&index,&value)==2)
+		{
+			for(i=next_index;i<index;i++)
+			{
+				feature_max[i]=max(feature_max[i],0);
+				feature_min[i]=min(feature_min[i],0);
+			}
+
+			feature_max[index]=max(feature_max[index],value);
+			feature_min[index]=min(feature_min[index],value);
+
+			SKIP_ELEMENT
+			next_index=index+1;
+		}
+
+		for(i=next_index;i<=max_index;i++)
+		{
+			feature_max[i]=max(feature_max[i],0);
+			feature_min[i]=min(feature_min[i],0);
+		}
+	}
+
+	rewind(fp);
+
+	/* pass 2.5: save/restore feature_min/feature_max */
+
+	if(restore_filename)
+	{
+		/* fp_restore rewound in finding max_index */
+		int idx, c;
+		double fmin, fmax;
+		int next_index = 1;
+
+		if((c = fgetc(fp_restore)) == 'y')
+		{
+			if(fscanf(fp_restore, "%lf %lf\n", &y_lower, &y_upper) != 2 ||
+			   fscanf(fp_restore, "%lf %lf\n", &y_min, &y_max) != 2)
+				return clean_up(fp_restore, fp, "ERROR: failed to read scaling parameters\n");
+			y_scaling = 1;
+		}
+		else
+			ungetc(c, fp_restore);
+
+		if (fgetc(fp_restore) == 'x')
+		{
+			if(fscanf(fp_restore, "%lf %lf\n", &lower, &upper) != 2)
+				return clean_up(fp_restore, fp, "ERROR: failed to read scaling parameters\n");
+			while(fscanf(fp_restore,"%d %lf %lf\n",&idx,&fmin,&fmax)==3)
+			{
+				for(i = next_index;i<idx;i++)
+					if(feature_min[i] != feature_max[i])
+						fprintf(stderr,
+							"WARNING: feature index %d appeared in file %s was not seen in the scaling factor file %s. The feature is scaled to 0.\n",
+							i, argv[argc-1], restore_filename);
+
+				feature_min[idx] = fmin;
+				feature_max[idx] = fmax;
+
+				next_index = idx + 1;
+			}
+
+			for(i=next_index;i<=max_index;i++)
+				if(feature_min[i] != feature_max[i])
+					fprintf(stderr,
+						"WARNING: feature index %d appeared in file %s was not seen in the scaling factor file %s. The feature is scaled to 0.\n",
+						i, argv[argc-1], restore_filename);
+		}
+	}
+
+	if(save_filename)
+	{
+		FILE *fp_save = fopen(save_filename,"w");
+		if(fp_save==NULL)
+		{
+			fprintf(stderr,"can't open file %s\n", save_filename);
+			exit(1);
+		}
+		if(y_scaling)
+		{
+			fprintf(fp_save, "y\n");
+			fprintf(fp_save, "%.17g %.17g\n", y_lower, y_upper);
+			fprintf(fp_save, "%.17g %.17g\n", y_min, y_max);
+		}
+		fprintf(fp_save, "x\n");
+		fprintf(fp_save, "%.17g %.17g\n", lower, upper);
+		for(i=1;i<=max_index;i++)
+		{
+			if(feature_min[i]!=feature_max[i])
+				fprintf(fp_save,"%d %.17g %.17g\n",i,feature_min[i],feature_max[i]);
+		}
+
+		if(min_index < 1)
+			fprintf(stderr,
+				"WARNING: scaling factors with indices smaller than 1 are not stored to the file %s.\n", save_filename);
+
+		fclose(fp_save);
+	}
+
+	/* pass 3: scale */
+	while(readline(fp)!=NULL)
+	{
+		char *p=line;
+		int next_index=1;
+		double target;
+		double value;
+
+		if (sscanf(p,"%lf",&target) != 1)
+			return clean_up(fp_restore, fp, "ERROR: failed to read labels\n");
+		output_target(target);
+
+		SKIP_TARGET
+
+		while(sscanf(p,"%d:%lf",&index,&value)==2)
+		{
+			for(i=next_index;i<index;i++)
+				output(i,0);
+
+			output(index,value);
+
+			SKIP_ELEMENT
+			next_index=index+1;
+		}
+
+		for(i=next_index;i<=max_index;i++)
+			output(i,0);
+
+		printf("\n");
+	}
+
+	if(new_num_nonzeros > num_nonzeros)
+		fprintf(stderr,
+			"WARNING: original #nonzeros %ld\n"
+			"       < new      #nonzeros %ld\n"
+			"If feature values are non-negative and sparse, use -l 0 rather than the default -l -1\n",
+			num_nonzeros, new_num_nonzeros);
+
+	free(line);
+	free(feature_max);
+	free(feature_min);
+	fclose(fp);
+	return 0;
+}
+
+char* readline(FILE *input)
+{
+	int len;
+
+	if(fgets(line,max_line_len,input) == NULL)
+		return NULL;
+
+	while(strrchr(line,'\n') == NULL)
+	{
+		max_line_len *= 2;
+		line = (char *) realloc(line, max_line_len);
+		len = (int) strlen(line);
+		if(fgets(line+len,max_line_len-len,input) == NULL)
+			break;
+	}
+	return line;
+}
+
+void output_target(double value)
+{
+	if(y_scaling)
+	{
+		if(value == y_min)
+			value = y_lower;
+		else if(value == y_max)
+			value = y_upper;
+		else value = y_lower + (y_upper-y_lower) *
+			     (value - y_min)/(y_max-y_min);
+	}
+	printf("%.17g ",value);
+}
+
+void output(int index, double value)
+{
+	/* skip single-valued attribute */
+	if(feature_max[index] == feature_min[index])
+		return;
+
+	if(value == feature_min[index])
+		value = lower;
+	else if(value == feature_max[index])
+		value = upper;
+	else
+		value = lower + (upper-lower) *
+			(value-feature_min[index])/
+			(feature_max[index]-feature_min[index]);
+
+	if(value != 0)
+	{
+		printf("%d:%g ",index, value);
+		new_num_nonzeros++;
+	}
+}
+
+int clean_up(FILE *fp_restore, FILE *fp, const char* msg)
+{
+	fprintf(stderr, "%s", msg);
+
+	free(line);
+	free(feature_max);
+	free(feature_min);
+	fclose(fp);
+	if (fp_restore)
+		fclose(fp_restore);
+
+	return -1;
+}
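
A minimal sketch of the intended end-to-end workflow with the new Python
scaling functions (the data path and training options are illustrative;
note that the scale_param found on the training data must be reused to
scale any test data, so both are mapped by the same coef/offset):

>>> from liblinearutil import *
>>> y, x = svm_read_problem('../heart_scale', return_scipy=True)
>>> scale_param = csr_find_scale_param(x, lower=0)
>>> m = train(y, csr_scale(x, scale_param), '-c 4')
>>> y_test, x_test = svm_read_problem('../heart_scale', return_scipy=True)
>>> p_labs, p_acc, p_vals = predict(y_test, csr_scale(x_test, scale_param), m)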