Saturday, November 16, 2013

h5py: The resizing overhead

A cool thing about data arrays stored in HDF5 via h5py is that you can incrementally add data to them. This is the practical way of processing large data sets: you read in large datasets piece by piece, process them and then append the processed data to an array on disk. An interesting thing about this is that there is a small size overhead in the saved file associated with this resizing compared to the same data saved all at once with no resizing of the HDF5 datasets.
I did the computations by using block sizes of [10, 100, 1000, 10000] elements and block counts of [10, 100, 1000, 10000]. The corresponding matrix of overhead (excess bytes needed for the resized version over the directly saved version) looks like this (rows are for the block sizes, columns are for the block counts):
overhead ->
array([[   9264,    2064,    3792,    9920],
       [   2064,    3792,    9920,   52544],
       [   3792,    9920,   52544,  462744],
       [   9920,   52544,  462744, 4570320]])
As you can see the matrix is symmetric, indicating that block size and resize counts (number of blocks) don't interact and it is only the total final size of the data that matters. Someone with knowledge of the internals of HDF5 and h5py will probably be able to explain why this pattern of overhead occurs and why it only depends on the total size of the data and not on the number of resizes performed. Also, interestingly, the simple act of marking a dataset as resizable adds space overhead to the storage (third plot). This is a plot of the additional overhead incurred as a result of saving the same sized data set all at once, but marking one as resizable. Sidenote: If you let h5py choose the data type on its own during a dummy initialization it will pick float32 (e.g. f.create_dataset('data',(0,1), maxshape=(None,1) ) Code:
import h5py, numpy, os

def resize_test(block_sizes=(10, 100, 1000, 10000), n_blocks=(10, 100, 1000, 10000), dtyp=float):
  """Measure the file-size overhead caused by incrementally resizing an HDF5 dataset.

  For every (block size, block count) pair the same total amount of data is
  written twice: once in a single shot into a fixed-shape dataset
  ('no_resize.h5') and once block-by-block into a resizable dataset
  ('yes_resize.h5'). The resulting file sizes are recorded for comparison.

  Parameters
  ----------
  block_sizes : sequence of int
      Elements written per resize step (indexes the rows of the result).
  n_blocks : sequence of int
      Number of resize steps performed (indexes the columns of the result).
  dtyp : numpy dtype
      Element type used for the stored data.

  Returns
  -------
  numpy.ndarray, shape (len(block_sizes), len(n_blocks), 2), dtype int
      [..., 0] is the file size written all at once,
      [..., 1] is the file size written via repeated resizes.
  """
  # Tuples (not lists) as defaults avoid the mutable-default-argument pitfall.
  file_size = numpy.empty((len(block_sizes), len(n_blocks), 2), dtype=int)
  for i, bl_sz in enumerate(block_sizes):
    for j, n_bl in enumerate(n_blocks):
      # Reference file: the full array written in one shot, fixed shape.
      with h5py.File('no_resize.h5', 'w') as f:
        f.create_dataset('data', data=numpy.empty((bl_sz * n_bl, 1)), dtype=dtyp)

      # Test file: start empty, grow the dataset one block at a time.
      with h5py.File('yes_resize.h5', 'w') as f:
        f.create_dataset('data', (0, 1), maxshape=(None, 1), dtype=dtyp)
        for n in range(n_bl):
          f['data'].resize(((n + 1) * bl_sz, 1))
          # Actual values are irrelevant to the size measurement, so empty()
          # (uninitialized memory) is the cheapest filler.
          f['data'][-bl_sz:, :] = numpy.empty((bl_sz, 1), dtype=dtyp)

      # Reopen read-only (explicit 'r'; the old default mode was append) and
      # query the on-disk size through the underlying OS file handle.
      with h5py.File('no_resize.h5', 'r') as f:
        print(f['data'].dtype)
        print(f['data'].shape)
        file_size[i, j, 0] = os.fstat(f.fid.get_vfd_handle()).st_size

      with h5py.File('yes_resize.h5', 'r') as f:
        print(f['data'].dtype)
        print(f['data'].shape)
        file_size[i, j, 1] = os.fstat(f.fid.get_vfd_handle()).st_size
  return file_size

def default_test(data_sizes=(100, 1000, 10000, 100000, 1000000, 10000000, 100000000), dtyp=float):
  """Measure the overhead of merely *declaring* an HDF5 dataset resizable.

  For each size the identical array is written all at once into two files:
  one with a fixed shape ('no_resize.h5') and one marked resizable via
  maxshape ('yes_resize.h5'). No resize is ever performed, so any size
  difference is purely the cost of the resizable flag.

  Parameters
  ----------
  data_sizes : sequence of int
      Total element counts to test (indexes the rows of the result).
  dtyp : numpy dtype
      Element type used for the stored data.

  Returns
  -------
  numpy.ndarray, shape (len(data_sizes), 2), dtype int
      [:, 0] is the fixed-shape file size, [:, 1] the resizable one.
  """
  # Tuple default avoids the mutable-default-argument pitfall.
  file_size = numpy.empty((len(data_sizes), 2), dtype=int)
  for i, dt_sz in enumerate(data_sizes):
    # Fixed-shape reference file.
    with h5py.File('no_resize.h5', 'w') as f:
      f.create_dataset('data', data=numpy.empty((dt_sz, 1)), dtype=dtyp)

    # Same data, but the dataset is flagged resizable along axis 0.
    with h5py.File('yes_resize.h5', 'w') as f:
      f.create_dataset('data', data=numpy.empty((dt_sz, 1)), maxshape=(None, 1), dtype=dtyp)

    # Reopen read-only (explicit 'r'; the old default mode was append) and
    # query the on-disk size through the underlying OS file handle.
    with h5py.File('no_resize.h5', 'r') as f:
      print(f['data'].dtype)
      print(f['data'].shape)
      file_size[i, 0] = os.fstat(f.fid.get_vfd_handle()).st_size

    with h5py.File('yes_resize.h5', 'r') as f:
      print(f['data'].dtype)
      print(f['data'].shape)
      file_size[i, 1] = os.fstat(f.fid.get_vfd_handle()).st_size
  return file_size



# Overhead of incremental resizing: resized-file size minus one-shot size,
# one entry per (block size, block count) combination.
data = resize_test()
overhead = data[..., 1] - data[..., 0]

# Overhead of merely declaring a dataset resizable, with no resize performed.
data2 = default_test()
overhead2 = data2[..., 1] - data2[..., 0]
And, for those interested, here is the full data matrix (which you can get simply by running the code above):
In [170]: data
Out[170]: 
array([[[     2944,     12208],
        [    10144,     12208],
        [    82144,     85936],
        [   802144,    812064]],

       [[    10144,     12208],
        [    82144,     85936],
        [   802144,    812064],
        [  8002144,   8054688]],

       [[    82144,     85936],
        [   802144,    812064],
        [  8002144,   8054688],
        [ 80002144,  80464888]],

       [[   802144,    812064],
        [  8002144,   8054688],
        [ 80002144,  80464888],
        [800002144, 804572464]]])
And I took a short cut to making the plots by pasting the computed data in:
# Data pasted in from the runs above (shortcut instead of recomputing).
x1 = [100, 1000, 10000, 100000, 1000000, 10000000, 100000000]    # element counts
x2 = [2944, 10144, 82144, 802144, 8002144, 80002144, 800002144]  # no-resize file sizes (bytes)
# Resizing overhead per total data size. First entry corrected from the
# pasted 9265 to 9264 to match the computed overhead matrix.
y = [9264, 2064, 3792, 9920, 52544, 462744, 4570320]

x3 = [100, 1000, 10000, 100000, 1000000, 10000000, 100000000]
y3 = overhead2  # overhead from marking a dataset resizable without resizing

pylab.figure(figsize=(12, 4))
pylab.subplots_adjust(bottom=0.15, left=0.1, right=0.97, top=0.9)

# Plot 1: overhead vs element count, with actual incremental resizing.
pylab.subplot(1, 3, 1)
pylab.loglog(x1, y, 'k.-', lw=5)
pylab.xlabel('Elements')
pylab.ylabel('Overhead (bytes)')
pylab.title('Resized')
pylab.setp(pylab.gca(), ylim=[1e3, 1e7])

# Plot 2: the same overhead, plotted against the unresized file size instead.
pylab.subplot(1, 3, 2)
pylab.loglog(x2, y, 'k.-', lw=5)
pylab.xlabel('File size (bytes)')
pylab.ylabel('Overhead (bytes)')
pylab.title('Resized')
pylab.setp(pylab.gca(), ylim=[1e3, 1e7])

# Plot 3: overhead from only *marking* the dataset resizable (no resizes).
pylab.subplot(1, 3, 3)
pylab.loglog(x3, y3, 'k.-', lw=5)
pylab.xlabel('Elements')
pylab.ylabel('Overhead (bytes)')
pylab.title('No actual resizing')
pylab.setp(pylab.gca(), ylim=[1e3, 1e7])

No comments:

Post a Comment