granicus.if.org Git - liblinear/blob - svm-scale.c
Cross compilation fix
[liblinear] / svm-scale.c
1 #include <float.h>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <ctype.h>
5 #include <string.h>
6
/*
 * Print the command-line usage summary to stdout and terminate the
 * process with exit status 1. Never returns.
 */
void exit_with_help()
{
	static const char usage_text[] =
		"Usage: svm-scale [options] data_filename\n"
		"options:\n"
		"-l lower : x scaling lower limit (default -1)\n"
		"-u upper : x scaling upper limit (default +1)\n"
		"-y y_lower y_upper : y scaling limits (default: no y scaling)\n"
		"-s save_filename : save scaling parameters to save_filename\n"
		"-r restore_filename : restore scaling parameters from restore_filename\n";

	fputs(usage_text, stdout);
	exit(1);
}
20
/* Line buffer shared by readline() and its callers; allocated in main(). */
char *line = NULL;
/* Current capacity of `line`, in bytes. */
int max_line_len = 1024;

/* Output range for scaled feature values (options -l / -u). */
double lower = -1.0;
double upper = 1.0;
/* Output range for scaled targets (option -y); used only when y_scaling is set. */
double y_lower;
double y_upper;
int y_scaling = 0;

/* Per-feature extrema, indexed by feature index; allocated in main(). */
double *feature_max;
double *feature_min;

/* Extrema of the target column. */
double y_max = -DBL_MAX;
double y_min = DBL_MAX;

/* Largest / smallest feature index encountered. */
int max_index;
int min_index;

/* Counts of feature entries in the input and of non-zero entries written. */
long int num_nonzeros = 0;
long int new_num_nonzeros = 0;

/* NOTE: function-like macros; each argument may be evaluated twice. */
#define max(x,y) ((x) > (y) ? (x) : (y))
#define min(x,y) ((x) < (y) ? (x) : (y))

void output_target(double value);
void output(int index, double value);
char* readline(FILE *input);
int clean_up(FILE *fp_restore, FILE *fp, const char *msg);
41
42 int main(int argc,char **argv)
43 {
44         int i,index;
45         FILE *fp, *fp_restore = NULL;
46         char *save_filename = NULL;
47         char *restore_filename = NULL;
48
49         for(i=1;i<argc;i++)
50         {
51                 if(argv[i][0] != '-') break;
52                 ++i;
53                 switch(argv[i-1][1])
54                 {
55                         case 'l': lower = atof(argv[i]); break;
56                         case 'u': upper = atof(argv[i]); break;
57                         case 'y':
58                                 y_lower = atof(argv[i]);
59                                 ++i;
60                                 y_upper = atof(argv[i]);
61                                 y_scaling = 1;
62                                 break;
63                         case 's': save_filename = argv[i]; break;
64                         case 'r': restore_filename = argv[i]; break;
65                         default:
66                                 fprintf(stderr,"unknown option\n");
67                                 exit_with_help();
68                 }
69         }
70
71         if(!(upper > lower) || (y_scaling && !(y_upper > y_lower)))
72         {
73                 fprintf(stderr,"inconsistent lower/upper specification\n");
74                 exit(1);
75         }
76
77         if(restore_filename && save_filename)
78         {
79                 fprintf(stderr,"cannot use -r and -s simultaneously\n");
80                 exit(1);
81         }
82
83         if(argc != i+1)
84                 exit_with_help();
85
86         fp=fopen(argv[i],"r");
87
88         if(fp==NULL)
89         {
90                 fprintf(stderr,"can't open file %s\n", argv[i]);
91                 exit(1);
92         }
93
94         line = (char *) malloc(max_line_len*sizeof(char));
95
96 #define SKIP_TARGET\
97         while(isspace(*p)) ++p;\
98         while(!isspace(*p)) ++p;
99
100 #define SKIP_ELEMENT\
101         while(*p!=':') ++p;\
102         ++p;\
103         while(isspace(*p)) ++p;\
104         while(*p && !isspace(*p)) ++p;
105
106         /* assumption: min index of attributes is 1 */
107         /* pass 1: find out max index of attributes */
108         max_index = 0;
109         min_index = 1;
110
111         if(restore_filename)
112         {
113                 int idx, c;
114
115                 fp_restore = fopen(restore_filename,"r");
116                 if(fp_restore==NULL)
117                 {
118                         fprintf(stderr,"can't open file %s\n", restore_filename);
119                         exit(1);
120                 }
121
122                 c = fgetc(fp_restore);
123                 if(c == 'y')
124                 {
125                         readline(fp_restore);
126                         readline(fp_restore);
127                         readline(fp_restore);
128                 }
129                 readline(fp_restore);
130                 readline(fp_restore);
131
132                 while(fscanf(fp_restore,"%d %*f %*f\n",&idx) == 1)
133                         max_index = max(idx,max_index);
134                 rewind(fp_restore);
135         }
136
137         while(readline(fp)!=NULL)
138         {
139                 char *p=line;
140
141                 SKIP_TARGET
142
143                 while(sscanf(p,"%d:%*f",&index)==1)
144                 {
145                         max_index = max(max_index, index);
146                         min_index = min(min_index, index);
147                         SKIP_ELEMENT
148                         num_nonzeros++;
149                 }
150         }
151
152         if(min_index < 1)
153                 fprintf(stderr,
154                         "WARNING: minimal feature index is %d, but indices should start from 1\n", min_index);
155
156         rewind(fp);
157
158         feature_max = (double *)malloc((max_index+1)* sizeof(double));
159         feature_min = (double *)malloc((max_index+1)* sizeof(double));
160
161         if(feature_max == NULL || feature_min == NULL)
162         {
163                 fprintf(stderr,"can't allocate enough memory\n");
164                 exit(1);
165         }
166
167         for(i=0;i<=max_index;i++)
168         {
169                 feature_max[i]=-DBL_MAX;
170                 feature_min[i]=DBL_MAX;
171         }
172
173         /* pass 2: find out min/max value */
174         while(readline(fp)!=NULL)
175         {
176                 char *p=line;
177                 int next_index=1;
178                 double target;
179                 double value;
180
181                 if (sscanf(p,"%lf",&target) != 1)
182                         return clean_up(fp_restore, fp, "ERROR: failed to read labels\n");
183                 y_max = max(y_max,target);
184                 y_min = min(y_min,target);
185
186                 SKIP_TARGET
187
188                 while(sscanf(p,"%d:%lf",&index,&value)==2)
189                 {
190                         for(i=next_index;i<index;i++)
191                         {
192                                 feature_max[i]=max(feature_max[i],0);
193                                 feature_min[i]=min(feature_min[i],0);
194                         }
195
196                         feature_max[index]=max(feature_max[index],value);
197                         feature_min[index]=min(feature_min[index],value);
198
199                         SKIP_ELEMENT
200                         next_index=index+1;
201                 }
202
203                 for(i=next_index;i<=max_index;i++)
204                 {
205                         feature_max[i]=max(feature_max[i],0);
206                         feature_min[i]=min(feature_min[i],0);
207                 }
208         }
209
210         rewind(fp);
211
212         /* pass 2.5: save/restore feature_min/feature_max */
213
214         if(restore_filename)
215         {
216                 /* fp_restore rewinded in finding max_index */
217                 int idx, c;
218                 double fmin, fmax;
219                 int next_index = 1;
220
221                 if((c = fgetc(fp_restore)) == 'y')
222                 {
223                         if(fscanf(fp_restore, "%lf %lf\n", &y_lower, &y_upper) != 2 ||
224                            fscanf(fp_restore, "%lf %lf\n", &y_min, &y_max) != 2)
225                                 return clean_up(fp_restore, fp, "ERROR: failed to read scaling parameters\n");
226                         y_scaling = 1;
227                 }
228                 else
229                         ungetc(c, fp_restore);
230
231                 if (fgetc(fp_restore) == 'x')
232                 {
233                         if(fscanf(fp_restore, "%lf %lf\n", &lower, &upper) != 2)
234                                 return clean_up(fp_restore, fp, "ERROR: failed to read scaling parameters\n");
235                         while(fscanf(fp_restore,"%d %lf %lf\n",&idx,&fmin,&fmax)==3)
236                         {
237                                 for(i = next_index;i<idx;i++)
238                                         if(feature_min[i] != feature_max[i])
239                                         {
240                                                 fprintf(stderr,
241                                                         "WARNING: feature index %d appeared in file %s was not seen in the scaling factor file %s. The feature is scaled to 0.\n",
242                                                         i, argv[argc-1], restore_filename);
243                                                 feature_min[i] = 0;
244                                                 feature_max[i] = 0;
245                                         }
246
247                                 feature_min[idx] = fmin;
248                                 feature_max[idx] = fmax;
249
250                                 next_index = idx + 1;
251                         }
252
253                         for(i=next_index;i<=max_index;i++)
254                                 if(feature_min[i] != feature_max[i])
255                                 {
256                                         fprintf(stderr,
257                                                 "WARNING: feature index %d appeared in file %s was not seen in the scaling factor file %s. The feature is scaled to 0.\n",
258                                                 i, argv[argc-1], restore_filename);
259                                         feature_min[i] = 0;
260                                         feature_max[i] = 0;
261                                 }
262                 }
263                 fclose(fp_restore);
264         }
265
266         if(save_filename)
267         {
268                 FILE *fp_save = fopen(save_filename,"w");
269                 if(fp_save==NULL)
270                 {
271                         fprintf(stderr,"can't open file %s\n", save_filename);
272                         exit(1);
273                 }
274                 if(y_scaling)
275                 {
276                         fprintf(fp_save, "y\n");
277                         fprintf(fp_save, "%.17g %.17g\n", y_lower, y_upper);
278                         fprintf(fp_save, "%.17g %.17g\n", y_min, y_max);
279                 }
280                 fprintf(fp_save, "x\n");
281                 fprintf(fp_save, "%.17g %.17g\n", lower, upper);
282                 for(i=1;i<=max_index;i++)
283                 {
284                         if(feature_min[i]!=feature_max[i])
285                                 fprintf(fp_save,"%d %.17g %.17g\n",i,feature_min[i],feature_max[i]);
286                 }
287
288                 if(min_index < 1)
289                         fprintf(stderr,
290                                 "WARNING: scaling factors with indices smaller than 1 are not stored to the file %s.\n", save_filename);
291
292                 fclose(fp_save);
293         }
294
295         /* pass 3: scale */
296         while(readline(fp)!=NULL)
297         {
298                 char *p=line;
299                 int next_index=1;
300                 double target;
301                 double value;
302
303                 if (sscanf(p,"%lf",&target) != 1)
304                         return clean_up(NULL, fp, "ERROR: failed to read labels\n");
305                 output_target(target);
306
307                 SKIP_TARGET
308
309                 while(sscanf(p,"%d:%lf",&index,&value)==2)
310                 {
311                         for(i=next_index;i<index;i++)
312                                 output(i,0);
313
314                         output(index,value);
315
316                         SKIP_ELEMENT
317                         next_index=index+1;
318                 }
319
320                 for(i=next_index;i<=max_index;i++)
321                         output(i,0);
322
323                 printf("\n");
324         }
325
326         if (new_num_nonzeros > num_nonzeros)
327                 fprintf(stderr,
328                         "WARNING: original #nonzeros %ld\n"
329                         "       > new      #nonzeros %ld\n"
330                         "If feature values are non-negative and sparse, use -l 0 rather than the default -l -1\n",
331                         num_nonzeros, new_num_nonzeros);
332
333         free(line);
334         free(feature_max);
335         free(feature_min);
336         fclose(fp);
337         return 0;
338 }
339
340 char* readline(FILE *input)
341 {
342         int len;
343
344         if(fgets(line,max_line_len,input) == NULL)
345                 return NULL;
346
347         while(strrchr(line,'\n') == NULL)
348         {
349                 max_line_len *= 2;
350                 line = (char *) realloc(line, max_line_len);
351                 len = (int) strlen(line);
352                 if(fgets(line+len,max_line_len-len,input) == NULL)
353                         break;
354         }
355         return line;
356 }
357
358 void output_target(double value)
359 {
360         if(y_scaling)
361         {
362                 if(value == y_min)
363                         value = y_lower;
364                 else if(value == y_max)
365                         value = y_upper;
366                 else value = y_lower + (y_upper-y_lower) *
367                              (value - y_min)/(y_max-y_min);
368         }
369         printf("%.17g ",value);
370 }
371
372 void output(int index, double value)
373 {
374         /* skip single-valued attribute */
375         if(feature_max[index] == feature_min[index])
376                 return;
377
378         if(value == feature_min[index])
379                 value = lower;
380         else if(value == feature_max[index])
381                 value = upper;
382         else
383                 value = lower + (upper-lower) *
384                         (value-feature_min[index])/
385                         (feature_max[index]-feature_min[index]);
386
387         if(value != 0)
388         {
389                 printf("%d:%g ",index, value);
390                 new_num_nonzeros++;
391         }
392 }
393
394 int clean_up(FILE *fp_restore, FILE *fp, const char* msg)
395 {
396         fprintf(stderr, "%s", msg);
397         free(line);
398         free(feature_max);
399         free(feature_min);
400         fclose(fp);
401         if (fp_restore)
402                 fclose(fp_restore);
403         return -1;
404 }
405