10 "Usage: svm-scale [options] data_filename\n"
12 "-l lower : x scaling lower limit (default -1)\n"
13 "-u upper : x scaling upper limit (default +1)\n"
14 "-y y_lower y_upper : y scaling limits (default: no y scaling)\n"
15 "-s save_filename : save scaling parameters to save_filename\n"
16 "-r restore_filename : restore scaling parameters from restore_filename\n"
/* Size used to allocate the shared line buffer and passed to fgets(); starts at 1024. */
22 int max_line_len = 1024;
/* x scaling limits (defaults -1 and +1) and y scaling limits (no initializer here;
   set from the -y option or from the restore file). */
23 double lower=-1.0,upper=1.0,y_lower,y_upper;
/* Running maximum/minimum of the target value. Each starts at the opposite
   extreme so that the first max()/min() update replaces it. */
27 double y_max = -DBL_MAX;
28 double y_min = DBL_MAX;
/* Counters reported in the "#nonzeros" warning printed at the end of main(). */
31 long int num_nonzeros = 0;
32 long int new_num_nonzeros = 0;
/* Function-like max/min. Each argument appears twice in the expansion, so an
   argument with side effects may be evaluated more than once; pass only plain
   variables or constants. */
34 #define max(x,y) (((x)>(y))?(x):(y))
35 #define min(x,y) (((x)<(y))?(x):(y))
/* Forward declarations for the helpers defined after main(). */

/* Scales a target value using y_lower/y_upper and y_min/y_max, then prints it
   with "%.17g " on stdout. */
37 void output_target(double value);
/* Scales one feature value into the [lower, upper] range using the recorded
   feature_min/feature_max for that index and prints it as "index:value ".
   Attributes whose min equals max are treated as single-valued. */
38 void output(int index, double value);
/* Reads from input into the shared line buffer via fgets(); callers treat a
   NULL return as end of input. */
39 char* readline(FILE *input);
/* Error-exit helper: prints msg to stderr. NOTE(review): presumably also
   releases the two file handles and returns a failure code - confirm against
   the full definition. */
40 int clean_up(FILE *fp_restore, FILE *fp, const char *msg);
42 int main(int argc,char **argv)
45 FILE *fp, *fp_restore = NULL;
46 char *save_filename = NULL;
47 char *restore_filename = NULL;
51 if(argv[i][0] != '-') break;
55 case 'l': lower = atof(argv[i]); break;
56 case 'u': upper = atof(argv[i]); break;
58 y_lower = atof(argv[i]);
60 y_upper = atof(argv[i]);
63 case 's': save_filename = argv[i]; break;
64 case 'r': restore_filename = argv[i]; break;
66 fprintf(stderr,"unknown option\n");
71 if(!(upper > lower) || (y_scaling && !(y_upper > y_lower)))
73 fprintf(stderr,"inconsistent lower/upper specification\n");
77 if(restore_filename && save_filename)
79 fprintf(stderr,"cannot use -r and -s simultaneously\n");
86 fp=fopen(argv[i],"r");
90 fprintf(stderr,"can't open file %s\n", argv[i]);
94 line = (char *) malloc(max_line_len*sizeof(char));
97 while(isspace(*p)) ++p;\
98 while(!isspace(*p)) ++p;
100 #define SKIP_ELEMENT\
103 while(isspace(*p)) ++p;\
104 while(*p && !isspace(*p)) ++p;
106 /* assumption: min index of attributes is 1 */
107 /* pass 1: find out max index of attributes */
115 fp_restore = fopen(restore_filename,"r");
118 fprintf(stderr,"can't open file %s\n", restore_filename);
122 c = fgetc(fp_restore);
125 readline(fp_restore);
126 readline(fp_restore);
127 readline(fp_restore);
129 readline(fp_restore);
130 readline(fp_restore);
132 while(fscanf(fp_restore,"%d %*f %*f\n",&idx) == 1)
133 max_index = max(idx,max_index);
137 while(readline(fp)!=NULL)
143 while(sscanf(p,"%d:%*f",&index)==1)
145 max_index = max(max_index, index);
146 min_index = min(min_index, index);
154 "WARNING: minimal feature index is %d, but indices should start from 1\n", min_index);
158 feature_max = (double *)malloc((max_index+1)* sizeof(double));
159 feature_min = (double *)malloc((max_index+1)* sizeof(double));
161 if(feature_max == NULL || feature_min == NULL)
163 fprintf(stderr,"can't allocate enough memory\n");
167 for(i=0;i<=max_index;i++)
169 feature_max[i]=-DBL_MAX;
170 feature_min[i]=DBL_MAX;
173 /* pass 2: find out min/max value */
174 while(readline(fp)!=NULL)
181 if (sscanf(p,"%lf",&target) != 1)
182 return clean_up(fp_restore, fp, "ERROR: failed to read labels\n");
183 y_max = max(y_max,target);
184 y_min = min(y_min,target);
188 while(sscanf(p,"%d:%lf",&index,&value)==2)
190 for(i=next_index;i<index;i++)
192 feature_max[i]=max(feature_max[i],0);
193 feature_min[i]=min(feature_min[i],0);
196 feature_max[index]=max(feature_max[index],value);
197 feature_min[index]=min(feature_min[index],value);
203 for(i=next_index;i<=max_index;i++)
205 feature_max[i]=max(feature_max[i],0);
206 feature_min[i]=min(feature_min[i],0);
212 /* pass 2.5: save/restore feature_min/feature_max */
216 /* fp_restore was rewound while finding max_index */
221 if((c = fgetc(fp_restore)) == 'y')
223 if(fscanf(fp_restore, "%lf %lf\n", &y_lower, &y_upper) != 2 ||
224 fscanf(fp_restore, "%lf %lf\n", &y_min, &y_max) != 2)
225 return clean_up(fp_restore, fp, "ERROR: failed to read scaling parameters\n");
229 ungetc(c, fp_restore);
231 if (fgetc(fp_restore) == 'x')
233 if(fscanf(fp_restore, "%lf %lf\n", &lower, &upper) != 2)
234 return clean_up(fp_restore, fp, "ERROR: failed to read scaling parameters\n");
235 while(fscanf(fp_restore,"%d %lf %lf\n",&idx,&fmin,&fmax)==3)
237 for(i = next_index;i<idx;i++)
238 if(feature_min[i] != feature_max[i])
241 "WARNING: feature index %d appeared in file %s was not seen in the scaling factor file %s. The feature is scaled to 0.\n",
242 i, argv[argc-1], restore_filename);
247 feature_min[idx] = fmin;
248 feature_max[idx] = fmax;
250 next_index = idx + 1;
253 for(i=next_index;i<=max_index;i++)
254 if(feature_min[i] != feature_max[i])
257 "WARNING: feature index %d appeared in file %s was not seen in the scaling factor file %s. The feature is scaled to 0.\n",
258 i, argv[argc-1], restore_filename);
268 FILE *fp_save = fopen(save_filename,"w");
271 fprintf(stderr,"can't open file %s\n", save_filename);
276 fprintf(fp_save, "y\n");
277 fprintf(fp_save, "%.17g %.17g\n", y_lower, y_upper);
278 fprintf(fp_save, "%.17g %.17g\n", y_min, y_max);
280 fprintf(fp_save, "x\n");
281 fprintf(fp_save, "%.17g %.17g\n", lower, upper);
282 for(i=1;i<=max_index;i++)
284 if(feature_min[i]!=feature_max[i])
285 fprintf(fp_save,"%d %.17g %.17g\n",i,feature_min[i],feature_max[i]);
290 "WARNING: scaling factors with indices smaller than 1 are not stored to the file %s.\n", save_filename);
296 while(readline(fp)!=NULL)
303 if (sscanf(p,"%lf",&target) != 1)
304 return clean_up(NULL, fp, "ERROR: failed to read labels\n");
305 output_target(target);
309 while(sscanf(p,"%d:%lf",&index,&value)==2)
311 for(i=next_index;i<index;i++)
320 for(i=next_index;i<=max_index;i++)
326 if (new_num_nonzeros > num_nonzeros)
328 "WARNING: original #nonzeros %ld\n"
329 " > new #nonzeros %ld\n"
330 "If feature values are non-negative and sparse, use -l 0 rather than the default -l -1\n",
331 num_nonzeros, new_num_nonzeros);
340 char* readline(FILE *input)
344 if(fgets(line,max_line_len,input) == NULL)
347 while(strrchr(line,'\n') == NULL)
350 line = (char *) realloc(line, max_line_len);
351 len = (int) strlen(line);
352 if(fgets(line+len,max_line_len-len,input) == NULL)
358 void output_target(double value)
364 else if(value == y_max)
366 else value = y_lower + (y_upper-y_lower) *
367 (value - y_min)/(y_max-y_min);
369 printf("%.17g ",value);
372 void output(int index, double value)
374 /* skip single-valued attribute */
375 if(feature_max[index] == feature_min[index])
378 if(value == feature_min[index])
380 else if(value == feature_max[index])
383 value = lower + (upper-lower) *
384 (value-feature_min[index])/
385 (feature_max[index]-feature_min[index]);
389 printf("%d:%g ",index, value);
394 int clean_up(FILE *fp_restore, FILE *fp, const char* msg)
396 fprintf(stderr, "%s", msg);