3 from __future__ import print_function
8 from scipy import sparse
14 __all__ = ['svm_read_problem', 'evaluations', 'csr_find_scale_param', 'csr_scale']
16 def svm_read_problem(data_file_name, return_scipy=False):
# NOTE(review): this extract is missing interior lines of this function
# (docstring quotes, initialisation of prob_y/prob_x/col_idx/row_ptr/xi,
# the `nz` counter, and the non-scipy return) — comments below describe
# only what the visible lines establish.
18 svm_read_problem(data_file_name, return_scipy=False) -> [y, x], y: list, x: list of dictionary
19 svm_read_problem(data_file_name, return_scipy=True) -> [y, x], y: ndarray, x: csr_matrix
21 Read LIBSVM-format data from data_file_name and return labels y
# Each line is "<label> <idx>:<val> <idx>:<val> ..."; split once to
# separate the label from the feature string.
28 for i, line in enumerate(open(data_file_name)):
29 line = line.split(None, 1)
30 # In case an instance with all zero features
31 if len(line) == 1: line += ['']
32 label, features = line
33 prob_y += [float(label)]
# Sparse path: accumulate CSR components (values, column indices, row
# pointer) instead of building per-instance dicts.
34 if scipy != None and return_scipy:
36 for e in features.split():
37 ind, val = e.split(":")
# LIBSVM feature indices are 1-based; CSR column indices are 0-based.
40 col_idx += [int(ind)-1]
# Row pointer grows by this row's nonzero count (nz computed on a
# missing line — presumably the per-row nonzero counter).
43 row_ptr += [row_ptr[-1]+nz]
# Dict path: keep the original 1-based LIBSVM index as the dict key
# (note the asymmetry with the CSR path above — intentional in LIBSVM).
46 for e in features.split():
47 ind, val = e.split(":")
48 xi[int(ind)] = float(val)
# Convert the accumulated Python lists to arrays and assemble the CSR
# matrix in (data, indices, indptr) form.
50 if scipy != None and return_scipy:
51 prob_y = scipy.array(prob_y)
52 prob_x = scipy.array(prob_x)
53 col_idx = scipy.array(col_idx)
54 row_ptr = scipy.array(row_ptr)
55 prob_x = sparse.csr_matrix((prob_x, col_idx, row_ptr))
56 return (prob_y, prob_x)
58 def evaluations_scipy(ty, pv):
# NOTE(review): interior lines are missing from this extract (the
# computation of l, sumv, sumy, sumvv, sumyy, sumvy used by the SCC
# formula, and presumably a try/except around it — confirm against the
# full source).
60 evaluations_scipy(ty, pv) -> (ACC, MSE, SCC)
63 Calculate accuracy, mean squared error and squared correlation coefficient
64 using the true values (ty) and predicted values (pv).
# Both arguments must already be ndarrays; evaluations() converts
# lists/tuples before delegating here.
66 if not (scipy != None and isinstance(ty, scipy.ndarray) and isinstance(pv, scipy.ndarray)):
67 raise TypeError("type of ty and pv must be ndarray")
68 if len(ty) != len(pv):
69 raise ValueError("len(ty) must be equal to len(pv)")
# Accuracy as a percentage of exact matches; MSE as mean squared residual.
70 ACC = 100.0*(ty == pv).mean()
71 MSE = ((ty - pv)**2).mean()
# errstate(all='raise') promotes floating-point warnings (e.g. a zero
# denominator when ty or pv is constant) to exceptions inside this block.
78 with scipy.errstate(all = 'raise'):
80 SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
83 return (float(ACC), float(MSE), float(SCC))
85 def evaluations(ty, pv, useScipy = True):
# NOTE(review): interior lines are missing from this extract (the
# accumulation of total_correct and the sum* terms inside the loop, and
# the MSE computation) — comments describe only the visible lines.
87 evaluations(ty, pv, useScipy) -> (ACC, MSE, SCC)
88 ty, pv: list, tuple or ndarray
89 useScipy: convert ty, pv to ndarray, and use scipy functions for the evaluation
91 Calculate accuracy, mean squared error and squared correlation coefficient
92 using the true values (ty) and predicted values (pv).
# Fast path: delegate to the vectorized implementation when scipy is
# importable and the caller did not opt out.
94 if scipy != None and useScipy:
95 return evaluations_scipy(scipy.asarray(ty), scipy.asarray(pv))
96 if len(ty) != len(pv):
97 raise ValueError("len(ty) must be equal to len(pv)")
# Pure-Python fallback: single pass accumulating counts and moments.
98 total_correct = total_error = 0
99 sumv = sumy = sumvv = sumyy = sumvy = 0
100 for v, y in zip(pv, ty):
103 total_error += (v-y)*(v-y)
# l is presumably len(ty), computed on a missing line — confirm.
110 ACC = 100.0*total_correct/l
# Squared (Pearson) correlation coefficient from the accumulated sums.
113 SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
116 return (float(ACC), float(MSE), float(SCC))
118 def csr_find_scale_param(x, lower=-1, upper=1):
# Compute per-feature linear scaling parameters that map each column of
# the CSR matrix x into [lower, upper]; returns {'coef', 'offset'} for
# use by csr_scale().
# NOTE(review): some interior lines are missing from this extract
# (presumably `l, n = x.shape` and the print-to-stderr call wrapping the
# warning text) — confirm against the full source.
119 assert isinstance(x, sparse.csr_matrix)
# Per-column min/max over all rows, densified to flat 1-D arrays.
122 feat_min = x.min(axis=0).toarray().flatten()
123 feat_max = x.max(axis=0).toarray().flatten()
124 coef = (feat_max - feat_min) / (upper - lower)
# Invert only nonzero ranges; constant features keep coef == 0 so they
# are left unscaled (and get offset 0 below).
125 coef[coef != 0] = 1.0 / coef[coef != 0]
127 # (x - ones(l,1) * feat_min') * diag(coef) + lower
128 # = x * diag(coef) - ones(l, 1) * (feat_min' * diag(coef)) + lower
129 # = x * diag(coef) + ones(l, 1) * (-feat_min' * diag(coef) + lower)
130 # = x * diag(coef) + ones(l, 1) * offset'
131 offset = -feat_min * coef + lower
132 offset[coef == 0] = 0
# A nonzero offset densifies that whole column; warn when the scaled
# matrix would carry at least ~3x the original nonzeros.
134 if sum(offset != 0) * l > 3 * x.getnnz():
136 "WARNING: The #nonzeros of the scaled data is at least 2 times larger than the original one.\n"
137 "If feature values are non-negative and sparse, set lower=0 rather than the default lower=-1.",
140 return {'coef':coef, 'offset':offset}
142 def csr_scale(x, scale_param):
# Apply the linear scaling produced by csr_find_scale_param() to the CSR
# matrix x: scaled_x = x * diag(coef) + ones(l,1) * offset'.
# NOTE(review): some interior lines are missing from this extract
# (presumably `l, n = x.shape`, the print-to-stderr calls around the
# warnings, and the final return of scaled_x) — confirm against the
# full source.
143 assert isinstance(x, sparse.csr_matrix)
145 offset = scale_param['offset']
146 coef = scale_param['coef']
147 assert len(coef) == len(offset)
# If the parameter dimension disagrees with x's feature count, warn and
# resize (truncate or zero-pad) rather than fail.
151 if not n == len(coef):
152 print("WARNING: The dimension of scaling parameters and feature number do not match.", file=sys.stderr)
153 coef = resize(coef, n)
154 offset = resize(offset, n)
156 # scaled_x = x * diag(coef) + ones(l, 1) * offset'
# Broadcast the offset row to every instance by stacking l copies.
157 offset = sparse.csr_matrix(offset.reshape(1, n))
158 offset = sparse.vstack([offset] * l, format='csr', dtype=x.dtype)
159 scaled_x = x.dot(sparse.diags(coef, 0, shape=(n, n))) + offset
# Scaling with nonzero offsets can densify the matrix; warn when the
# nonzero count grew.
161 if scaled_x.getnnz() > x.getnnz():
163 "WARNING: original #nonzeros %d\n" % x.getnnz() +
164 " > new #nonzeros %d\n" % scaled_x.getnnz() +
165 "If feature values are non-negative and sparse, get scale_param by setting lower=0 rather than the default lower=-1.",