granicus.if.org Git - liblinear/blob - python/commonutil.py

   1 #!/usr/bin/env python
   2
   3 from __future__ import print_function
   4 import sys
   5
   6 try:
   7         import scipy
   8         from scipy import sparse
   9 except:
  10         scipy = None
  11         sparse = None
  12
  13
  14 __all__ = ['svm_read_problem', 'evaluations', 'csr_find_scale_param', 'csr_scale']
  15
  16 def svm_read_problem(data_file_name, return_scipy=False):
  17         """
  18         svm_read_problem(data_file_name, return_scipy=False) -> [y, x], y: list, x: list of dictionary
  19         svm_read_problem(data_file_name, return_scipy=True)  -> [y, x], y: ndarray, x: csr_matrix
  20
  21         Read LIBSVM-format data from data_file_name and return labels y
  22         and data instances x.
  23         """
  24         prob_y = []
  25         prob_x = []
  26         row_ptr = [0]
  27         col_idx = []
  28         for i, line in enumerate(open(data_file_name)):
  29                 line = line.split(None, 1)
  30                 # In case an instance with all zero features
  31                 if len(line) == 1: line += ['']
  32                 label, features = line
  33                 prob_y += [float(label)]
  34                 if scipy != None and return_scipy:
  35                         nz = 0
  36                         for e in features.split():
  37                                 ind, val = e.split(":")
  38                                 val = float(val)
  39                                 if val != 0:
  40                                         col_idx += [int(ind)-1]
  41                                         prob_x += [val]
  42                                         nz += 1
  43                         row_ptr += [row_ptr[-1]+nz]
  44                 else:
  45                         xi = {}
  46                         for e in features.split():
  47                                 ind, val = e.split(":")
  48                                 xi[int(ind)] = float(val)
  49                         prob_x += [xi]
  50         if scipy != None and return_scipy:
  51                 prob_y = scipy.array(prob_y)
  52                 prob_x = scipy.array(prob_x)
  53                 col_idx = scipy.array(col_idx)
  54                 row_ptr = scipy.array(row_ptr)
  55                 prob_x = sparse.csr_matrix((prob_x, col_idx, row_ptr))
  56         return (prob_y, prob_x)
  57
  58 def evaluations_scipy(ty, pv):
  59         """
  60         evaluations_scipy(ty, pv) -> (ACC, MSE, SCC)
  61         ty, pv: ndarray
  62
  63         Calculate accuracy, mean squared error and squared correlation coefficient
  64         using the true values (ty) and predicted values (pv).
  65         """
  66         if not (scipy != None and isinstance(ty, scipy.ndarray) and isinstance(pv, scipy.ndarray)):
  67                 raise TypeError("type of ty and pv must be ndarray")
  68         if len(ty) != len(pv):
  69                 raise ValueError("len(ty) must be equal to len(pv)")
  70         ACC = 100.0*(ty == pv).mean()
  71         MSE = ((ty - pv)**2).mean()
  72         l = len(ty)
  73         sumv = pv.sum()
  74         sumy = ty.sum()
  75         sumvy = (pv*ty).sum()
  76         sumvv = (pv*pv).sum()
  77         sumyy = (ty*ty).sum()
  78         with scipy.errstate(all = 'raise'):
  79                 try:
  80                         SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
  81                 except:
  82                         SCC = float('nan')
  83         return (float(ACC), float(MSE), float(SCC))
  84
  85 def evaluations(ty, pv, useScipy = True):
  86         """
  87         evaluations(ty, pv, useScipy) -> (ACC, MSE, SCC)
  88         ty, pv: list, tuple or ndarray
  89         useScipy: convert ty, pv to ndarray, and use scipy functions for the evaluation
  90
  91         Calculate accuracy, mean squared error and squared correlation coefficient
  92         using the true values (ty) and predicted values (pv).
  93         """
  94         if scipy != None and useScipy:
  95                 return evaluations_scipy(scipy.asarray(ty), scipy.asarray(pv))
  96         if len(ty) != len(pv):
  97                 raise ValueError("len(ty) must be equal to len(pv)")
  98         total_correct = total_error = 0
  99         sumv = sumy = sumvv = sumyy = sumvy = 0
 100         for v, y in zip(pv, ty):
 101                 if y == v:
 102                         total_correct += 1
 103                 total_error += (v-y)*(v-y)
 104                 sumv += v
 105                 sumy += y
 106                 sumvv += v*v
 107                 sumyy += y*y
 108                 sumvy += v*y
 109         l = len(ty)
 110         ACC = 100.0*total_correct/l
 111         MSE = total_error/l
 112         try:
 113                 SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
 114         except:
 115                 SCC = float('nan')
 116         return (float(ACC), float(MSE), float(SCC))
 117
 118 def csr_find_scale_param(x, lower=-1, upper=1):
 119         assert isinstance(x, sparse.csr_matrix)
 120         assert lower < upper
 121         l, n = x.shape
 122         feat_min = x.min(axis=0).toarray().flatten()
 123         feat_max = x.max(axis=0).toarray().flatten()
 124         coef = (feat_max - feat_min) / (upper - lower)
 125         coef[coef != 0] = 1.0 / coef[coef != 0]
 126
 127         # (x - ones(l,1) * feat_min') * diag(coef) + lower
 128         # = x * diag(coef) - ones(l, 1) * (feat_min' * diag(coef)) + lower
 129         # = x * diag(coef) + ones(l, 1) * (-feat_min' * diag(coef) + lower)
 130         # = x * diag(coef) + ones(l, 1) * offset'
 131         offset = -feat_min * coef + lower
 132         offset[coef == 0] = 0
 133
 134         if sum(offset != 0) * l > 3 * x.getnnz():
 135                 print(
 136                         "WARNING: The #nonzeros of the scaled data is at least 2 times larger than the original one.\n"
 137                         "If feature values are non-negative and sparse, set lower=0 rather than the default lower=-1.",
 138                         file=sys.stderr)
 139
 140         return {'coef':coef, 'offset':offset}
 141
 142 def csr_scale(x, scale_param):
 143         assert isinstance(x, sparse.csr_matrix)
 144
 145         offset = scale_param['offset']
 146         coef = scale_param['coef']
 147         assert len(coef) == len(offset)
 148
 149         l, n = x.shape
 150
 151         if not n == len(coef):
 152                 print("WARNING: The dimension of scaling parameters and feature number do not match.", file=sys.stderr)
 153                 coef = resize(coef, n)
 154                 offset = resize(offset, n)
 155
 156         # scaled_x = x * diag(coef) + ones(l, 1) * offset'
 157         offset = sparse.csr_matrix(offset.reshape(1, n))
 158         offset = sparse.vstack([offset] * l, format='csr', dtype=x.dtype)
 159         scaled_x = x.dot(sparse.diags(coef, 0, shape=(n, n))) + offset
 160
 161         if scaled_x.getnnz() > x.getnnz():
 162                 print(
 163                         "WARNING: original #nonzeros %d\n" % x.getnnz() +
 164                         "       > new      #nonzeros %d\n" % scaled_x.getnnz() +
 165                         "If feature values are non-negative and sparse, get scale_param by setting lower=0 rather than the default lower=-1.",
 166                         file=sys.stderr)