Include LIBSVM's svm-scale.c in the package.

author Chih-Jen Lin <cjlin@csie.ntu.edu.tw>

Thu, 26 Jul 2018 15:11:23 +0000 (23:11 +0800)

committer Chih-Jen Lin <cjlin@csie.ntu.edu.tw>

Thu, 26 Jul 2018 15:11:23 +0000 (23:11 +0800)
author Chih-Jen Lin <cjlin@csie.ntu.edu.tw>
Thu, 26 Jul 2018 15:11:23 +0000 (23:11 +0800)
committer Chih-Jen Lin <cjlin@csie.ntu.edu.tw>
Thu, 26 Jul 2018 15:11:23 +0000 (23:11 +0800)
diff --git a/README b/README

index 53a97e45e9f6f7fda18b08434333ac105929002d..287edb161238312c7745afd7db39d41dae16b0ea 100644 (file)
--- a/README
+++ b/README
@@ -17,6 +17,7 @@ Table of Contents
  - Installation
  - `train' Usage
  - `predict' Usage
+- `svm-scale' Usage
  - Examples
  - Library Usage
  - Building Windows Binaries
@@ -75,8 +76,8 @@ sparse data, use `-l 0' to keep the sparsity.
  Installation
  ============
  
-On Unix systems, type `make' to build the `train' and `predict'
-programs. Run them without arguments to show the usages.
+On Unix systems, type `make' to build the `train', `predict',
+and `svm-scale' programs. Run them without arguments to show the usages.
  
  On other systems, consult `Makefile' to build them (e.g., see
  'Building Windows binaries' in this file) or use the pre-built
@@ -92,6 +93,8 @@ and mark
  
          LIBS ?= blas/blas.a
  
+The tool `svm-scale', borrowed from LIBSVM, is for scaling input data file.
+
  `train' Usage
  =============
  
@@ -233,6 +236,11 @@ options:
  Note that -b is only needed in the prediction phase. This is different
  from the setting of LIBSVM.
  
+`svm-scale' Usage
+=================
+
+See LIBSVM README.
+
  Examples
  ========
  
@@ -437,7 +445,7 @@ in linear.h, so you can check the version number.
  
  - Function: void find_parameter_C(const struct problem *prob,
              const struct parameter *param, int nr_fold, double start_C,
-           double max_C, double *best_C, double *best_rate);
+            double max_C, double *best_C, double *best_rate);
  
      This function is similar to cross_validation. However, instead of
      conducting cross validation under a specified parameter C, it
@@ -581,8 +589,8 @@ nmake -f Makefile.win clean all
  nmake -f Makefile.win lib
  
  4. (Optional) To build 32-bit windows binaries, you must
-       (1) Setup "C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin\vcvars32.bat" instead of vcvars64.bat
-       (2) Change CFLAGS in Makefile.win: /D _WIN64 to /D _WIN32
+        (1) Setup "C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin\vcvars32.bat" instead of vcvars64.bat
+        (2) Change CFLAGS in Makefile.win: /D _WIN64 to /D _WIN32
  
  MATLAB/OCTAVE Interface
  =======================
diff --git a/python/README b/python/README

index 351e70fba875db178eb7de2f3299bdca2aee1a81..ccc3531022298c698c14ca1326527ffd3937d53b 100644 (file)
--- a/python/README
+++ b/python/README
@@ -108,6 +108,11 @@ in liblinearutil.py and the usage is the same as the LIBLINEAR MATLAB interface.
  >>> param = parameter('-s 0 -c 4 -B 1')
  >>> m = train(prob, param)
  
+# Apply data scaling in Scipy format
+>>> y, x = svm_read_problem('../heart_scale', return_scipy=True)
+>>> scale_param = csr_find_scale_param(x, lower=0)
+>>> scaled_x = csr_scale(x, scale_param)
+
  # Other utility functions
  >>> save_model('heart_scale.model', m)
  >>> m = load_model('heart_scale.model')
@@ -193,10 +198,10 @@ LIBLINEAR shared library:
  
      y: a Python list/tuple/ndarray of l labels (type must be int/double).
  
-       x: 1. a list/tuple of l training instances. Feature vector of
-             each training instance is a list/tuple or dictionary.
+    x: 1. a list/tuple of l training instances. Feature vector of
+          each training instance is a list/tuple or dictionary.
  
-          2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
+       2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
  
      bias: if bias >= 0, instance x becomes [x; bias]; if < 0, no bias term
            added (default -1)
@@ -310,10 +315,10 @@ The above command loads
  
      y: a list/tuple/ndarray of l training labels (type must be int/double).
  
-       x: 1. a list/tuple of l training instances. Feature vector of
-             each training instance is a list/tuple or dictionary.
+    x: 1. a list/tuple of l training instances. Feature vector of
+          each training instance is a list/tuple or dictionary.
  
-          2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
+       2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
  
      training_options: a string in the same form as that for LIBLINEAR command
                        mode.
@@ -356,13 +361,13 @@ The above command loads
      >>> p_labs, p_acc, p_vals = predict(y, x, model [,'predicting_options'])
  
      y: a list/tuple/ndarray of l true labels (type must be int/double).
-          It is used for calculating the accuracy. Use [] if true labels are
+       It is used for calculating the accuracy. Use [] if true labels are
         unavailable.
  
-       x: 1. a list/tuple of l training instances. Feature vector of
-             each training instance is a list/tuple or dictionary.
+    x: 1. a list/tuple of l training instances. Feature vector of
+          each training instance is a list/tuple or dictionary.
  
-          2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
+       2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
  
      predicting_options: a string of predicting options in the same format as
                          that of LIBLINEAR.
@@ -399,7 +404,7 @@ The above command loads
  
  - Function: evaluations
  
-    Calculate some evaluations using the true values (ty) and predicted
+    Calculate some evaluations using the true values (ty) and the predicted
      values (pv):
  
      >>> (ACC, MSE, SCC) = evaluations(ty, pv, useScipy)
@@ -416,6 +421,26 @@ The above command loads
  
      SCC: squared correlation coefficient.
  
+- Function: csr_find_scale_parameter/csr_scale
+
+    Scale data in csr format.
+
+    >>> param = csr_find_scale_param(x [, lower=l, upper=u])
+    >>> x = csr_scale(x, param)
+
+    x: a csr_matrix of data.
+
+    l: x scaling lower limit; default -1.
+
+    u: x scaling upper limit; default 1.
+
+    The scaling process is: x * diag(coef) + ones(l, 1) * offset'
+
+    param: a dictionary of scaling parameters, where param['coef'] = coef and param['offset'] = offset.
+
+    coef: a scipy array of scaling coefficients.
+
+    offset: a scipy array of scaling offsets.
  
  Additional Information
  ======================
diff --git a/python/commonutil.py b/python/commonutil.py

new file mode 100644 (file)

index 0000000..f2da8de
--- /dev/null
+++ b/python/commonutil.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+import sys
+
+try:
+       import scipy
+       from scipy import sparse
+except:
+       scipy = None
+       sparse = None
+
+
+__all__ = ['svm_read_problem', 'evaluations', 'csr_find_scale_param', 'csr_scale']
+
+def svm_read_problem(data_file_name, return_scipy=False):
+       """
+       svm_read_problem(data_file_name, return_scipy=False) -> [y, x], y: list, x: list of dictionary
+       svm_read_problem(data_file_name, return_scipy=True)  -> [y, x], y: ndarray, x: csr_matrix
+
+       Read LIBSVM-format data from data_file_name and return labels y
+       and data instances x.
+       """
+       prob_y = []
+       prob_x = []
+       row_ptr = [0]
+       col_idx = []
+       for i, line in enumerate(open(data_file_name)):
+               line = line.split(None, 1)
+               # In case an instance with all zero features
+               if len(line) == 1: line += ['']
+               label, features = line
+               prob_y += [float(label)]
+               if scipy != None and return_scipy:
+                       nz = 0
+                       for e in features.split():
+                               ind, val = e.split(":")
+                               val = float(val)
+                               if val != 0:
+                                       col_idx += [int(ind)-1]
+                                       prob_x += [val]
+                                       nz += 1
+                       row_ptr += [row_ptr[-1]+nz]
+               else:
+                       xi = {}
+                       for e in features.split():
+                               ind, val = e.split(":")
+                               xi[int(ind)] = float(val)
+                       prob_x += [xi]
+       if scipy != None and return_scipy:
+               prob_y = scipy.array(prob_y)
+               prob_x = scipy.array(prob_x)
+               col_idx = scipy.array(col_idx)
+               row_ptr = scipy.array(row_ptr)
+               prob_x = sparse.csr_matrix((prob_x, col_idx, row_ptr))
+       return (prob_y, prob_x)
+
+def evaluations_scipy(ty, pv):
+       """
+       evaluations_scipy(ty, pv) -> (ACC, MSE, SCC)
+       ty, pv: ndarray
+
+       Calculate accuracy, mean squared error and squared correlation coefficient
+       using the true values (ty) and predicted values (pv).
+       """
+       if not (scipy != None and isinstance(ty, scipy.ndarray) and isinstance(pv, scipy.ndarray)):
+               raise TypeError("type of ty and pv must be ndarray")
+       if len(ty) != len(pv):
+               raise ValueError("len(ty) must be equal to len(pv)")
+       ACC = 100.0*(ty == pv).mean()
+       MSE = ((ty - pv)**2).mean()
+       l = len(ty)
+       sumv = pv.sum()
+       sumy = ty.sum()
+       sumvy = (pv*ty).sum()
+       sumvv = (pv*pv).sum()
+       sumyy = (ty*ty).sum()
+       with scipy.errstate(all = 'raise'):
+               try:
+                       SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
+               except:
+                       SCC = float('nan')
+       return (float(ACC), float(MSE), float(SCC))
+
+def evaluations(ty, pv, useScipy = True):
+       """
+       evaluations(ty, pv, useScipy) -> (ACC, MSE, SCC)
+       ty, pv: list, tuple or ndarray
+       useScipy: convert ty, pv to ndarray, and use scipy functions for the evaluation
+
+       Calculate accuracy, mean squared error and squared correlation coefficient
+       using the true values (ty) and predicted values (pv).
+       """
+       if scipy != None and useScipy:
+               return evaluations_scipy(scipy.asarray(ty), scipy.asarray(pv))
+       if len(ty) != len(pv):
+               raise ValueError("len(ty) must be equal to len(pv)")
+       total_correct = total_error = 0
+       sumv = sumy = sumvv = sumyy = sumvy = 0
+       for v, y in zip(pv, ty):
+               if y == v:
+                       total_correct += 1
+               total_error += (v-y)*(v-y)
+               sumv += v
+               sumy += y
+               sumvv += v*v
+               sumyy += y*y
+               sumvy += v*y
+       l = len(ty)
+       ACC = 100.0*total_correct/l
+       MSE = total_error/l
+       try:
+               SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
+       except:
+               SCC = float('nan')
+       return (float(ACC), float(MSE), float(SCC))
+
+def csr_find_scale_param(x, lower=-1, upper=1):
+       assert isinstance(x, sparse.csr_matrix)
+       assert lower < upper
+       l, n = x.shape
+       feat_min = x.min(axis=0).toarray().flatten()
+       feat_max = x.max(axis=0).toarray().flatten()
+       coef = (feat_max - feat_min) / (upper - lower)
+       coef[coef != 0] = 1.0 / coef[coef != 0]
+
+       # (x - ones(l,1) * feat_min') * diag(coef) + lower
+       # = x * diag(coef) - ones(l, 1) * (feat_min' * diag(coef)) + lower
+       # = x * diag(coef) + ones(l, 1) * (-feat_min' * diag(coef) + lower)
+       # = x * diag(coef) + ones(l, 1) * offset'
+       offset = -feat_min * coef + lower
+       offset[coef == 0] = 0
+
+       if sum(offset != 0) * l > 3 * x.getnnz():
+               print(
+                       "WARNING: The #nonzeros of the scaled data is at least 2 times larger than the original one.\n"
+                       "If feature values are non-negative and sparse, set lower=0 rather than the default lower=-1.",
+                       file=sys.stderr)
+
+       return {'coef':coef, 'offset':offset}
+
+def csr_scale(x, scale_param):
+       assert isinstance(x, sparse.csr_matrix)
+
+       offset = scale_param['offset']
+       coef = scale_param['coef']
+       assert len(coef) == len(offset)
+
+       l, n = x.shape
+
+       if not n == len(coef):
+               print("WARNING: The dimension of scaling parameters and feature number do not match.", file=sys.stderr)
+               coef = resize(coef, n)
+               offset = resize(offset, n)
+
+       # scaled_x = x * diag(coef) + ones(l, 1) * offset'
+       offset = sparse.csr_matrix(offset.reshape(1, n))
+       offset = sparse.vstack([offset] * l, format='csr', dtype=x.dtype)
+       scaled_x = x.dot(sparse.diags(coef, 0, shape=(n, n))) + offset
+
+       if scaled_x.getnnz() > x.getnnz():
+               print(
+                       "WARNING: original #nonzeros %d\n" % x.getnnz() +
+                       "       > new      #nonzeros %d\n" % scaled_x.getnnz() +
+                       "If feature values are non-negative and sparse, get scale_param by setting lower=0 rather than the default lower=-1.",
+                       file=sys.stderr)
diff --git a/python/liblinearutil.py b/python/liblinearutil.py

index d3f3015430b80ea1631fcea3501da83d1464196c..9a07f597277327378ed5f0f03cf2d9b18267dcac 100644 (file)
--- a/python/liblinearutil.py
+++ b/python/liblinearutil.py
@@ -5,58 +5,17 @@ sys.path = [os.path.dirname(os.path.abspath(__file__))] + sys.path
  from liblinear import *
  from liblinear import __all__ as liblinear_all
  from liblinear import scipy, sparse
+from commonutil import *
+from commonutil import __all__ as common_all
  from ctypes import c_double
  
  if sys.version_info[0] < 3:
         range = xrange
         from itertools import izip as zip
  
-__all__ = ['svm_read_problem', 'load_model', 'save_model', 'evaluations',
-           'train', 'predict'] + liblinear_all
+__all__ = ['load_model', 'save_model', 'train', 'predict'] + liblinear_all + common_all
  
  
-def svm_read_problem(data_file_name, return_scipy=False):
-       """
-       svm_read_problem(data_file_name, return_scipy=False) -> [y, x], y: list, x: list of dictionary
-       svm_read_problem(data_file_name, return_scipy=True)  -> [y, x], y: ndarray, x: csr_matrix
-
-       Read LIBSVM-format data from data_file_name and return labels y
-       and data instances x.
-       """
-       prob_y = []
-       prob_x = []
-       row_ptr = [0]
-       col_idx = []
-       for i, line in enumerate(open(data_file_name)):
-               line = line.split(None, 1)
-               # In case an instance with all zero features
-               if len(line) == 1: line += ['']
-               label, features = line
-               prob_y += [float(label)]
-               if scipy != None and return_scipy:
-                       nz = 0
-                       for e in features.split():
-                               ind, val = e.split(":")
-                               val = float(val)
-                               if val != 0:
-                                       col_idx += [int(ind)-1]
-                                       prob_x += [val]
-                                       nz += 1
-                       row_ptr += [row_ptr[-1]+nz]
-               else:
-                       xi = {}
-                       for e in features.split():
-                               ind, val = e.split(":")
-                               xi[int(ind)] = float(val)
-                       prob_x += [xi]
-       if scipy != None and return_scipy:
-               prob_y = scipy.array(prob_y)
-               prob_x = scipy.array(prob_x)
-               col_idx = scipy.array(col_idx)
-               row_ptr = scipy.array(row_ptr)
-               prob_x = sparse.csr_matrix((prob_x, col_idx, row_ptr))
-       return (prob_y, prob_x)
-
  def load_model(model_file_name):
         """
         load_model(model_file_name) -> model
@@ -78,66 +37,6 @@ def save_model(model_file_name, model):
         """
         liblinear.save_model(model_file_name.encode(), model)
  
-def evaluations_scipy(ty, pv):
-       """
-       evaluations_scipy(ty, pv) -> (ACC, MSE, SCC)
-       ty, pv: ndarray
-
-       Calculate accuracy, mean squared error and squared correlation coefficient
-       using the true values (ty) and predicted values (pv).
-       """
-       if not (scipy != None and isinstance(ty, scipy.ndarray) and isinstance(pv, scipy.ndarray)):
-               raise TypeError("type of ty and pv must be ndarray")
-       if len(ty) != len(pv):
-               raise ValueError("len(ty) must be equal to len(pv)")
-       ACC = 100.0*(ty == pv).mean()
-       MSE = ((ty - pv)**2).mean()
-       l = len(ty)
-       sumv = pv.sum()
-       sumy = ty.sum()
-       sumvy = (pv*ty).sum()
-       sumvv = (pv*pv).sum()
-       sumyy = (ty*ty).sum()
-       with scipy.errstate(all = 'raise'):
-               try:
-                       SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
-               except:
-                       SCC = float('nan')
-       return (float(ACC), float(MSE), float(SCC))
-
-def evaluations(ty, pv, useScipy = True):
-       """
-       evaluations(ty, pv, useScipy) -> (ACC, MSE, SCC)
-       ty, pv: list, tuple or ndarray
-       useScipy: convert ty, pv to ndarray, and use scipy functions for the evaluation
-
-       Calculate accuracy, mean squared error and squared correlation coefficient
-       using the true values (ty) and predicted values (pv).
-       """
-       if scipy != None and useScipy:
-               return evaluations_scipy(scipy.asarray(ty), scipy.asarray(pv))
-       if len(ty) != len(pv):
-               raise ValueError("len(ty) must be equal to len(pv)")
-       total_correct = total_error = 0
-       sumv = sumy = sumvv = sumyy = sumvy = 0
-       for v, y in zip(pv, ty):
-               if y == v:
-                       total_correct += 1
-               total_error += (v-y)*(v-y)
-               sumv += v
-               sumy += y
-               sumvv += v*v
-               sumyy += y*y
-               sumvy += v*y
-       l = len(ty)
-       ACC = 100.0*total_correct/l
-       MSE = total_error/l
-       try:
-               SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
-       except:
-               SCC = float('nan')
-       return (float(ACC), float(MSE), float(SCC))
-
  def train(arg1, arg2=None, arg3=None):
         """
         train(y, x [, options]) -> model | ACC
diff --git a/svm-scale.c b/svm-scale.c

new file mode 100644 (file)

index 0000000..63d1677
--- /dev/null
+++ b/svm-scale.c
@@ -0,0 +1,405 @@
+#include <float.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+
+void exit_with_help()
+{
+       printf(
+       "Usage: svm-scale [options] data_filename\n"
+       "options:\n"
+       "-l lower : x scaling lower limit (default -1)\n"
+       "-u upper : x scaling upper limit (default +1)\n"
+       "-y y_lower y_upper : y scaling limits (default: no y scaling)\n"
+       "-s save_filename : save scaling parameters to save_filename\n"
+       "-r restore_filename : restore scaling parameters from restore_filename\n"
+       );
+       exit(1);
+}
+
+char *line = NULL;
+int max_line_len = 1024;
+double lower=-1.0,upper=1.0,y_lower,y_upper;
+int y_scaling = 0;
+double *feature_max;
+double *feature_min;
+double y_max = -DBL_MAX;
+double y_min = DBL_MAX;
+int max_index;
+int min_index;
+long int num_nonzeros = 0;
+long int new_num_nonzeros = 0;
+
+#define max(x,y) (((x)>(y))?(x):(y))
+#define min(x,y) (((x)<(y))?(x):(y))
+
+void output_target(double value);
+void output(int index, double value);
+char* readline(FILE *input);
+int clean_up(FILE *fp_restore, FILE *fp, const char *msg);
+
+int main(int argc,char **argv)
+{
+       int i,index;
+       FILE *fp, *fp_restore = NULL;
+       char *save_filename = NULL;
+       char *restore_filename = NULL;
+
+       for(i=1;i<argc;i++)
+       {
+               if(argv[i][0] != '-') break;
+               ++i;
+               switch(argv[i-1][1])
+               {
+                       case 'l': lower = atof(argv[i]); break;
+                       case 'u': upper = atof(argv[i]); break;
+                       case 'y':
+                               y_lower = atof(argv[i]);
+                               ++i;
+                               y_upper = atof(argv[i]);
+                               y_scaling = 1;
+                               break;
+                       case 's': save_filename = argv[i]; break;
+                       case 'r': restore_filename = argv[i]; break;
+                       default:
+                               fprintf(stderr,"unknown option\n");
+                               exit_with_help();
+               }
+       }
+
+       if(!(upper > lower) || (y_scaling && !(y_upper > y_lower)))
+       {
+               fprintf(stderr,"inconsistent lower/upper specification\n");
+               exit(1);
+       }
+
+       if(restore_filename && save_filename)
+       {
+               fprintf(stderr,"cannot use -r and -s simultaneously\n");
+               exit(1);
+       }
+
+       if(argc != i+1)
+               exit_with_help();
+
+       fp=fopen(argv[i],"r");
+
+       if(fp==NULL)
+       {
+               fprintf(stderr,"can't open file %s\n", argv[i]);
+               exit(1);
+       }
+
+       line = (char *) malloc(max_line_len*sizeof(char));
+
+#define SKIP_TARGET\
+       while(isspace(*p)) ++p;\
+       while(!isspace(*p)) ++p;
+
+#define SKIP_ELEMENT\
+       while(*p!=':') ++p;\
+       ++p;\
+       while(isspace(*p)) ++p;\
+       while(*p && !isspace(*p)) ++p;
+
+       /* assumption: min index of attributes is 1 */
+       /* pass 1: find out max index of attributes */
+       max_index = 0;
+       min_index = 1;
+
+       if(restore_filename)
+       {
+               int idx, c;
+
+               fp_restore = fopen(restore_filename,"r");
+               if(fp_restore==NULL)
+               {
+                       fprintf(stderr,"can't open file %s\n", restore_filename);
+                       exit(1);
+               }
+
+               c = fgetc(fp_restore);
+               if(c == 'y')
+               {
+                       readline(fp_restore);
+                       readline(fp_restore);
+                       readline(fp_restore);
+               }
+               readline(fp_restore);
+               readline(fp_restore);
+
+               while(fscanf(fp_restore,"%d %*f %*f\n",&idx) == 1)
+                       max_index = max(idx,max_index);
+               rewind(fp_restore);
+       }
+
+       while(readline(fp)!=NULL)
+       {
+               char *p=line;
+
+               SKIP_TARGET
+
+               while(sscanf(p,"%d:%*f",&index)==1)
+               {
+                       max_index = max(max_index, index);
+                       min_index = min(min_index, index);
+                       SKIP_ELEMENT
+                       num_nonzeros++;
+               }
+       }
+
+       if(min_index < 1)
+               fprintf(stderr,
+                       "WARNING: minimal feature index is %d, but indices should start from 1\n", min_index);
+
+       rewind(fp);
+
+       feature_max = (double *)malloc((max_index+1)* sizeof(double));
+       feature_min = (double *)malloc((max_index+1)* sizeof(double));
+
+       if(feature_max == NULL || feature_min == NULL)
+       {
+               fprintf(stderr,"can't allocate enough memory\n");
+               exit(1);
+       }
+
+       for(i=0;i<=max_index;i++)
+       {
+               feature_max[i]=-DBL_MAX;
+               feature_min[i]=DBL_MAX;
+       }
+
+       /* pass 2: find out min/max value */
+       while(readline(fp)!=NULL)
+       {
+               char *p=line;
+               int next_index=1;
+               double target;
+               double value;
+
+               if (sscanf(p,"%lf",&target) != 1)
+                       return clean_up(fp_restore, fp, "ERROR: failed to read labels\n");
+               y_max = max(y_max,target);
+               y_min = min(y_min,target);
+
+               SKIP_TARGET
+
+               while(sscanf(p,"%d:%lf",&index,&value)==2)
+               {
+                       for(i=next_index;i<index;i++)
+                       {
+                               feature_max[i]=max(feature_max[i],0);
+                               feature_min[i]=min(feature_min[i],0);
+                       }
+
+                       feature_max[index]=max(feature_max[index],value);
+                       feature_min[index]=min(feature_min[index],value);
+
+                       SKIP_ELEMENT
+                       next_index=index+1;
+               }
+
+               for(i=next_index;i<=max_index;i++)
+               {
+                       feature_max[i]=max(feature_max[i],0);
+                       feature_min[i]=min(feature_min[i],0);
+               }
+       }
+
+       rewind(fp);
+
+       /* pass 2.5: save/restore feature_min/feature_max */
+
+       if(restore_filename)
+       {
+               /* fp_restore rewinded in finding max_index */
+               int idx, c;
+               double fmin, fmax;
+               int next_index = 1;
+
+               if((c = fgetc(fp_restore)) == 'y')
+               {
+                       if(fscanf(fp_restore, "%lf %lf\n", &y_lower, &y_upper) != 2 ||
+                          fscanf(fp_restore, "%lf %lf\n", &y_min, &y_max) != 2)
+                               return clean_up(fp_restore, fp, "ERROR: failed to read scaling parameters\n");
+                       y_scaling = 1;
+               }
+               else
+                       ungetc(c, fp_restore);
+
+               if (fgetc(fp_restore) == 'x')
+               {
+                       if(fscanf(fp_restore, "%lf %lf\n", &lower, &upper) != 2)
+                               return clean_up(fp_restore, fp, "ERROR: failed to read scaling parameters\n");
+                       while(fscanf(fp_restore,"%d %lf %lf\n",&idx,&fmin,&fmax)==3)
+                       {
+                               for(i = next_index;i<idx;i++)
+                                       if(feature_min[i] != feature_max[i])
+                                       {
+                                               fprintf(stderr,
+                                                       "WARNING: feature index %d appeared in file %s was not seen in the scaling factor file %s. The feature is scaled to 0.\n",
+                                                       i, argv[argc-1], restore_filename);
+                                               feature_min[i] = 0;
+                                               feature_max[i] = 0;
+                                       }
+
+                               feature_min[idx] = fmin;
+                               feature_max[idx] = fmax;
+
+                               next_index = idx + 1;
+                       }
+
+                       for(i=next_index;i<=max_index;i++)
+                               if(feature_min[i] != feature_max[i])
+                               {
+                                       fprintf(stderr,
+                                               "WARNING: feature index %d appeared in file %s was not seen in the scaling factor file %s. The feature is scaled to 0.\n",
+                                               i, argv[argc-1], restore_filename);
+                                       feature_min[i] = 0;
+                                       feature_max[i] = 0;
+                               }
+               }
+               fclose(fp_restore);
+       }
+
+       if(save_filename)
+       {
+               FILE *fp_save = fopen(save_filename,"w");
+               if(fp_save==NULL)
+               {
+                       fprintf(stderr,"can't open file %s\n", save_filename);
+                       exit(1);
+               }
+               if(y_scaling)
+               {
+                       fprintf(fp_save, "y\n");
+                       fprintf(fp_save, "%.17g %.17g\n", y_lower, y_upper);
+                       fprintf(fp_save, "%.17g %.17g\n", y_min, y_max);
+               }
+               fprintf(fp_save, "x\n");
+               fprintf(fp_save, "%.17g %.17g\n", lower, upper);
+               for(i=1;i<=max_index;i++)
+               {
+                       if(feature_min[i]!=feature_max[i])
+                               fprintf(fp_save,"%d %.17g %.17g\n",i,feature_min[i],feature_max[i]);
+               }
+
+               if(min_index < 1)
+                       fprintf(stderr,
+                               "WARNING: scaling factors with indices smaller than 1 are not stored to the file %s.\n", save_filename);
+
+               fclose(fp_save);
+       }
+
+       /* pass 3: scale */
+       while(readline(fp)!=NULL)
+       {
+               char *p=line;
+               int next_index=1;
+               double target;
+               double value;
+
+               if (sscanf(p,"%lf",&target) != 1)
+                       return clean_up(NULL, fp, "ERROR: failed to read labels\n");
+               output_target(target);
+
+               SKIP_TARGET
+
+               while(sscanf(p,"%d:%lf",&index,&value)==2)
+               {
+                       for(i=next_index;i<index;i++)
+                               output(i,0);
+
+                       output(index,value);
+
+                       SKIP_ELEMENT
+                       next_index=index+1;
+               }
+
+               for(i=next_index;i<=max_index;i++)
+                       output(i,0);
+
+               printf("\n");
+       }
+
+       if (new_num_nonzeros > num_nonzeros)
+               fprintf(stderr,
+                       "WARNING: original #nonzeros %ld\n"
+                       "       > new      #nonzeros %ld\n"
+                       "If feature values are non-negative and sparse, use -l 0 rather than the default -l -1\n",
+                       num_nonzeros, new_num_nonzeros);
+
+       free(line);
+       free(feature_max);
+       free(feature_min);
+       fclose(fp);
+       return 0;
+}
+
+char* readline(FILE *input)
+{
+       int len;
+
+       if(fgets(line,max_line_len,input) == NULL)
+               return NULL;
+
+       while(strrchr(line,'\n') == NULL)
+       {
+               max_line_len *= 2;
+               line = (char *) realloc(line, max_line_len);
+               len = (int) strlen(line);
+               if(fgets(line+len,max_line_len-len,input) == NULL)
+                       break;
+       }
+       return line;
+}
+
+void output_target(double value)
+{
+       if(y_scaling)
+       {
+               if(value == y_min)
+                       value = y_lower;
+               else if(value == y_max)
+                       value = y_upper;
+               else value = y_lower + (y_upper-y_lower) *
+                            (value - y_min)/(y_max-y_min);
+       }
+       printf("%.17g ",value);
+}
+
+void output(int index, double value)
+{
+       /* skip single-valued attribute */
+       if(feature_max[index] == feature_min[index])
+               return;
+
+       if(value == feature_min[index])
+               value = lower;
+       else if(value == feature_max[index])
+               value = upper;
+       else
+               value = lower + (upper-lower) *
+                       (value-feature_min[index])/
+                       (feature_max[index]-feature_min[index]);
+
+       if(value != 0)
+       {
+               printf("%d:%g ",index, value);
+               new_num_nonzeros++;
+       }
+}
+
+int clean_up(FILE *fp_restore, FILE *fp, const char* msg)
+{
+       fprintf(stderr, "%s", msg);
+       free(line);
+       free(feature_max);
+       free(feature_min);
+       fclose(fp);
+       if (fp_restore)
+               fclose(fp_restore);
+       return -1;
+}
+
author	Chih-Jen Lin <cjlin@csie.ntu.edu.tw>
	Thu, 26 Jul 2018 15:11:23 +0000 (23:11 +0800)
committer	Chih-Jen Lin <cjlin@csie.ntu.edu.tw>
	Thu, 26 Jul 2018 15:11:23 +0000 (23:11 +0800)
README		patch \| blob \| history
python/README		patch \| blob \| history
python/commonutil.py	[new file with mode: 0644]	patch \| blob
python/liblinearutil.py		patch \| blob \| history
svm-scale.c	[new file with mode: 0644]	patch \| blob