From: ppetter1025 Date: Mon, 10 Sep 2018 12:24:54 +0000 (+0800) Subject: Changes due to a modification of the same file in LIBSVM: X-Git-Tag: v221~3 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=ceac30a68721637baeb960ab2680162229b8e540;p=liblinear Changes due to a modification of the same file in LIBSVM: use array for reading data in python/commonutil.py to lower the memory usage --- diff --git a/python/commonutil.py b/python/commonutil.py index f34b887..881f6bb 100644 --- a/python/commonutil.py +++ b/python/commonutil.py @@ -1,6 +1,7 @@ #!/usr/bin/env python from __future__ import print_function +from array import array import sys try: @@ -21,17 +22,23 @@ def svm_read_problem(data_file_name, return_scipy=False): Read LIBSVM-format data from data_file_name and return labels y and data instances x. """ - prob_y = [] - prob_x = [] - row_ptr = [0] - col_idx = [] + if scipy != None and return_scipy: + prob_y = array('d') + prob_x = array('d') + row_ptr = array('l', [0]) + col_idx = array('l') + else: + prob_y = [] + prob_x = [] + row_ptr = [0] + col_idx = [] indx_start = 1 for i, line in enumerate(open(data_file_name)): line = line.split(None, 1) # In case an instance with all zero features if len(line) == 1: line += [''] label, features = line - prob_y += [float(label)] + prob_y.append(float(label)) if scipy != None and return_scipy: nz = 0 for e in features.split(): @@ -40,10 +47,10 @@ def svm_read_problem(data_file_name, return_scipy=False): indx_start = 0 val = float(val) if val != 0: - col_idx += [int(ind)-indx_start] - prob_x += [val] + col_idx.append(int(ind)-indx_start) + prob_x.append(val) nz += 1 - row_ptr += [row_ptr[-1]+nz] + row_ptr.append(row_ptr[-1]+nz) else: xi = {} for e in features.split(): @@ -51,10 +58,10 @@ def svm_read_problem(data_file_name, return_scipy=False): xi[int(ind)] = float(val) prob_x += [xi] if scipy != None and return_scipy: - prob_y = scipy.array(prob_y) - prob_x = scipy.array(prob_x) - col_idx = scipy.array(col_idx) - row_ptr = scipy.array(row_ptr) + prob_y = scipy.frombuffer(prob_y, dtype='d') + prob_x = scipy.frombuffer(prob_x, dtype='d') + col_idx = scipy.frombuffer(col_idx, dtype='l') + row_ptr = scipy.frombuffer(row_ptr, dtype='l') prob_x = sparse.csr_matrix((prob_x, col_idx, row_ptr)) return (prob_y, prob_x)