Given a vector of strings, what is the best way to write them out to a HDF5 dataset? At the moment I'm doing something like the following:
const unsigned int MaxStrLength = 512;
struct TempContainer {
char string[MaxStrLength];
};
void writeVector (hid_t group, std::vector<std::string> const & v)
{
//
// Firstly copy the contents of the vector into a temporary container
std::vector<TempContainer> tc;
for (std::vector<std::string>::const_iterator i = v.begin ()
, end = v.end ()
; i != end
; ++i)
{
TempContainer t;
strncpy (t.string, i->c_str (), MaxStrLength);
tc.push_back (t);
}
//
// Write the temporary container to a dataset
hsize_t dims[] = { tc.size () } ;
hid_t dataspace = H5Screate_simple(sizeof(dims)/sizeof(*dims)
, dims
, NULL);
hid_t strtype = H5Tcopy (H5T_C_S1);
H5Tset_size (strtype, MaxStrLength);
hid_t datatype = H5Tcreate (H5T_COMPOUND, sizeof (TempConainer));
H5Tinsert (datatype
, "string"
, HOFFSET(TempContainer, string)
, strtype);
hid_t dataset = H5Dcreate1 (group
, "files"
, datatype
, dataspace
, H5P_DEFAULT);
H5Dwrite (dataset, datatype, H5S_ALL, H5S_ALL, H5P_DEFAULT, &tc[0] );
H5Dclose (dataset);
H5Sclose (dataspace);
H5Tclose (strtype);
H5Tclose (datatype);
}
At a minimum, I would really like to change the above so that:
I have no restrictions over how I store the data so for example, it doesn't have to be a COMPOUND datatype if there is a better way to do this.
EDIT: Just to narrow the problem down, I'm relatively familiar with playing with the data on the C++ side, it's the HDF5 side where I need most of the help.
Thanks for your help.
[Many thanks to dirkgently for his help in answering this.]
To write a variable length string in HDF5 use the following:
// Create the datatype as follows
hid_t datatype = H5Tcopy (H5T_C_S1);
H5Tset_size (datatype, H5T_VARIABLE);
//
// Pass the string to be written to H5Dwrite
// using the address of the pointer!
const char * s = v.c_str ();
H5Dwrite (dataset
, datatype
, H5S_ALL
, H5S_ALL
, H5P_DEFAULT
, &s );
One solution for writing a container is to write each element individually. This can be achieved using hyperslabs.
For example:
class WriteString
{
public:
WriteString (hid_t dataset, hid_t datatype
, hid_t dataspace, hid_t memspace)
: m_dataset (dataset), m_datatype (datatype)
, m_dataspace (dataspace), m_memspace (memspace)
, m_pos () {}
private:
hid_t m_dataset;
hid_t m_datatype;
hid_t m_dataspace;
hid_t m_memspace;
int m_pos;
//...
public:
void operator ()(std::vector<std::string>::value_type const & v)
{
// Select the file position, 1 record at position 'pos'
hsize_t count[] = { 1 } ;
hsize_t offset[] = { m_pos++ } ;
H5Sselect_hyperslab( m_dataspace
, H5S_SELECT_SET
, offset
, NULL
, count
, NULL );
const char * s = v.c_str ();
H5Dwrite (m_dataset
, m_datatype
, m_memspace
, m_dataspace
, H5P_DEFAULT
, &s );
}
};
// ...
void writeVector (hid_t group, std::vector<std::string> const & v)
{
hsize_t dims[] = { m_files.size () } ;
hid_t dataspace = H5Screate_simple(sizeof(dims)/sizeof(*dims)
, dims, NULL);
dims[0] = 1;
hid_t memspace = H5Screate_simple(sizeof(dims)/sizeof(*dims)
, dims, NULL);
hid_t datatype = H5Tcopy (H5T_C_S1);
H5Tset_size (datatype, H5T_VARIABLE);
hid_t dataset = H5Dcreate1 (group, "files", datatype
, dataspace, H5P_DEFAULT);
//
// Select the "memory" to be written out - just 1 record.
hsize_t offset[] = { 0 } ;
hsize_t count[] = { 1 } ;
H5Sselect_hyperslab( memspace, H5S_SELECT_SET, offset
, NULL, count, NULL );
std::for_each (v.begin ()
, v.end ()
, WriteStrings (dataset, datatype, dataspace, memspace));
H5Dclose (dataset);
H5Sclose (dataspace);
H5Sclose (memspace);
H5Tclose (datatype);
}