From: ppetter1025 <peter58972@gmail.com>
Date: Mon, 10 Sep 2018 12:24:54 +0000 (+0800)
Subject: Changes due to a modification of the same file in LIBSVM:
X-Git-Tag: v221~3
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=ceac30a68721637baeb960ab2680162229b8e540;p=liblinear

Changes due to a modification of the same file in LIBSVM:

	Use array.array instead of lists when reading data for the scipy return path in python/commonutil.py, to lower memory usage
---

diff --git a/python/commonutil.py b/python/commonutil.py
index f34b887..881f6bb 100644
--- a/python/commonutil.py
+++ b/python/commonutil.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 
 from __future__ import print_function
+from array import array
 import sys
 
 try:
@@ -21,17 +22,23 @@ def svm_read_problem(data_file_name, return_scipy=False):
 	Read LIBSVM-format data from data_file_name and return labels y
 	and data instances x.
 	"""
-	prob_y = []
-	prob_x = []
-	row_ptr = [0]
-	col_idx = []
+	if scipy != None and return_scipy:
+		prob_y = array('d')
+		prob_x = array('d')
+		row_ptr = array('l', [0])
+		col_idx = array('l')
+	else:
+		prob_y = []
+		prob_x = []
+		row_ptr = [0]
+		col_idx = []
 	indx_start = 1
 	for i, line in enumerate(open(data_file_name)):
 		line = line.split(None, 1)
 		# In case an instance with all zero features
 		if len(line) == 1: line += ['']
 		label, features = line
-		prob_y += [float(label)]
+		prob_y.append(float(label))
 		if scipy != None and return_scipy:
 			nz = 0
 			for e in features.split():
@@ -40,10 +47,10 @@ def svm_read_problem(data_file_name, return_scipy=False):
 					indx_start = 0
 				val = float(val)
 				if val != 0:
-					col_idx += [int(ind)-indx_start]
-					prob_x += [val]
+					col_idx.append(int(ind)-indx_start)
+					prob_x.append(val)
 					nz += 1
-			row_ptr += [row_ptr[-1]+nz]
+			row_ptr.append(row_ptr[-1]+nz)
 		else:
 			xi = {}
 			for e in features.split():
@@ -51,10 +58,10 @@ def svm_read_problem(data_file_name, return_scipy=False):
 				xi[int(ind)] = float(val)
 			prob_x += [xi]
 	if scipy != None and return_scipy:
-		prob_y = scipy.array(prob_y)
-		prob_x = scipy.array(prob_x)
-		col_idx = scipy.array(col_idx)
-		row_ptr = scipy.array(row_ptr)
+		prob_y = scipy.frombuffer(prob_y, dtype='d')
+		prob_x = scipy.frombuffer(prob_x, dtype='d')
+		col_idx = scipy.frombuffer(col_idx, dtype='l')
+		row_ptr = scipy.frombuffer(row_ptr, dtype='l')
 		prob_x = sparse.csr_matrix((prob_x, col_idx, row_ptr))
 	return (prob_y, prob_x)