At the moment, I am using h5py to generate HDF5 datasets. I have something like this:
import h5py
import numpy as np

# Parse the entire CSV in one call; the whole table is held in memory as a
# structured array whose field names come from the header row.
my_data = np.genfromtxt(
    "/tmp/data.csv",
    delimiter=",",
    dtype=None,
    names=True,
)
myFile = "/tmp/f.hdf"

# Append mode: the file is created if missing, otherwise opened read/write.
with h5py.File(myFile, "a") as f:
    # NOTE: `vendor` and `dataSet` are not defined in this snippet; they are
    # expected to be set by the surrounding code.
    dset = f.create_dataset(
        '%s/%s' % (vendor, dataSet),
        data=my_data,
        compression="gzip",
        compression_opts=9,
    )
This works well for a relatively large ASCII file (400MB). I would like to do the same for an even larger dataset (40GB). Is there a better or more efficient way to do this with h5py? I want to avoid loading the entire data set into memory.
Some information about the data:
dtype=None
from np.loadtxt()
You could infer the dtypes of your data by reading a smaller chunk of rows at the start of the text file. Once you have these, you can create a resizable HDF5 dataset and iteratively write chunks of rows from your text file to it.
Here's a generator that yields successive chunks of rows from a text file as numpy arrays:
import numpy as np
import warnings
def iter_genfromtxt(path, chunksize=100, **kwargs):
    """Yield consecutive chunks of rows from a text file as numpy arrays.

    The file is opened once and parsed `chunksize` rows at a time with
    `np.genfromtxt`, so only one chunk is held in memory at any moment. The
    dtype of the first chunk is reused for every later chunk so that all
    yielded arrays share the same dtype and field names.

    Args:
        path: Path to the text file.
        chunksize: Maximum number of rows to yield at a time.
        **kwargs: Additional keyword arguments are passed to `np.genfromtxt`,
            with the exception of `skip_footer` which is unsupported (a
            warning is issued and it is ignored). `skiprows` is accepted as
            an alias for `skip_header`. `max_rows` limits the total number of
            rows yielded across all chunks.

    Yields:
        `np.ndarray`s with a maximum row dimension of `chunksize`. Every
        chunk has at least one dimension, even when it holds a single row.

    Note:
        With `dtype=None` the column types are inferred from the first chunk
        only, so later rows are parsed using whatever types (including
        string widths) the first chunk produced.
    """
    names = kwargs.pop('names', None)
    max_rows = kwargs.pop('max_rows', None)
    skip_header = kwargs.pop('skip_header', kwargs.pop('skiprows', 0))
    if kwargs.pop('skip_footer', None) is not None:
        warnings.warn('`skip_footer` will be ignored')

    with open(path, 'rb') as f:

        # The first chunk is handled separately, since we may wish to skip
        # rows, read column headers etc.
        # `np.genfromtxt` squeezes a single-row result down to 0-d, which
        # would make `len(chunk)` fail, so restore the row axis.
        chunk = np.atleast_1d(
            np.genfromtxt(f, max_rows=chunksize, skip_header=skip_header,
                          names=names, **kwargs))

        # Ensure that subsequent chunks have consistent dtypes and field names
        kwargs.update({'dtype': chunk.dtype})

        # Dimensionality every later chunk must match. (A first chunk that
        # itself holds a single unstructured row cannot be told apart from a
        # single column, so it is taken as-is.)
        ndim = chunk.ndim

        while len(chunk):
            yield chunk[:max_rows]
            if max_rows is not None:
                max_rows -= len(chunk)
                if max_rows <= 0:
                    # A plain `return` ends the generator; raising
                    # StopIteration here is turned into RuntimeError by
                    # PEP 479 (Python 3.7+).
                    return
            chunk = np.atleast_1d(
                np.genfromtxt(f, max_rows=chunksize, **kwargs))
            if chunk.size and chunk.ndim < ndim:
                # A single unstructured row comes back squeezed to shape
                # (ncols,); put the row axis back so it is not mistaken for
                # `ncols` separate rows.
                chunk = chunk[np.newaxis, ...]
Now suppose we have a .csv file containing:
strings,ints,floats
a,1,0.1256290043
b,2,0.0071402451
c,3,0.2551627907
d,4,0.7958570533
e,5,0.8968247722
f,6,0.7291124437
g,7,0.4196829806
h,8,0.398944394
i,9,0.8718244087
j,10,0.67605461
k,11,0.7105670336
l,12,0.6341504091
m,13,0.1324232855
n,14,0.7062503808
o,15,0.1915132527
p,16,0.4140093777
q,17,0.1458217602
r,18,0.1183596433
s,19,0.0014556247
t,20,0.1649811301
We can read this data in chunks of 5 rows at a time, and write the resulting arrays to a resizable dataset:
import h5py

# Set up the chunked reader: 5 rows per chunk, header row supplies the field
# names, column types are inferred from the data.
gen = iter_genfromtxt(
    '/tmp/test.csv',
    chunksize=5,
    delimiter=',',
    names=True,
    dtype=None,
)

# Pull the first chunk up front; its dtype and shape define the dataset.
chunk = next(gen)
dtype = chunk.dtype
row_count = chunk.shape[0]

with h5py.File('/tmp/test.h5', 'w') as f:

    # Unlimited along the row axis so the dataset can grow chunk by chunk;
    # any trailing dimensions stay fixed.
    maxshape = (None,) + chunk.shape[1:]
    dset = f.create_dataset(
        'data',
        shape=chunk.shape,
        maxshape=maxshape,
        chunks=chunk.shape,
        dtype=dtype,
    )

    # Store the rows that were already read
    dset[:] = chunk

    for chunk in gen:
        new_count = row_count + chunk.shape[0]

        # Grow the dataset, then fill the freshly added tail
        dset.resize(new_count, axis=0)
        dset[row_count:new_count] = chunk

        # Remember how many rows have been written so far
        row_count = new_count
Output:
# Re-open the file read-only and load the whole dataset back into memory
with h5py.File('/tmp/test.h5', 'r') as f:
    stored = f['data'][:]
    print(repr(stored))
# array([(b'a', 1, 0.1256290043), (b'b', 2, 0.0071402451),
# (b'c', 3, 0.2551627907), (b'd', 4, 0.7958570533),
# (b'e', 5, 0.8968247722), (b'f', 6, 0.7291124437),
# (b'g', 7, 0.4196829806), (b'h', 8, 0.398944394),
# (b'i', 9, 0.8718244087), (b'j', 10, 0.67605461),
# (b'k', 11, 0.7105670336), (b'l', 12, 0.6341504091),
# (b'm', 13, 0.1324232855), (b'n', 14, 0.7062503808),
# (b'o', 15, 0.1915132527), (b'p', 16, 0.4140093777),
# (b'q', 17, 0.1458217602), (b'r', 18, 0.1183596433),
# (b's', 19, 0.0014556247), (b't', 20, 0.1649811301)],
# dtype=[('strings', 'S1'), ('ints', '<i8'), ('floats', '<f8')])
For your dataset you will probably want to use a larger chunksize.