I am trying to create an LMDB database for my Caffe machine learning project, but LMDB throws an error on the first attempt to insert a data point, saying the environment map size is full.
Here's the code that attempts to populate the database:
import numpy as np
from PIL import Image
import os
import lmdb
import random

# my data structure for holding image/label pairs
from serialization import DataPoint


class LoadImages(object):
    def __init__(self, image_data_path):
        self.image_data_path = image_data_path
        self.dirlist = os.listdir(image_data_path)

        # find the number of images that are to be read from disk
        # in this case there are 370 images.
        num = len(self.dirlist)

        # shuffle the list of image files so that they are read in a random order
        random.shuffle(self.dirlist)

        map_size = num*10

        j = 0

        # load images from disk
        for image_filename in os.listdir(image_data_path):
            # check that every image belongs to either category _D_ or _P_
            assert (image_filename[:3] == '_D_' or image_filename[:3] == '_P_'), "ERROR: unknown category"

        # set up the LMDB database object
        env = lmdb.open('image_lmdb', map_size=map_size)
        with env.begin(write=True) as txn:
            # iterate over (shuffled) list of image files
            for image_filename in self.dirlist:
                print "Loading " + str(j) + "th image from disk - percentage complete: " + str((float(j)/num) * 100) + " %"
                # open the image
                with open(str(image_data_path + "/" + image_filename), 'rb') as f:
                    image = Image.open(f)
                    npimage = np.asarray(image, dtype=np.float64)

                # discard alpha channel, if necessary
                if npimage.shape[2] == 4:
                    npimage = npimage[:, :, :3]
                    print image_filename + " had its alpha channel removed."

                # get category
                if image_filename[:3] == '_D_':
                    category = 0
                elif image_filename[:3] == '_P_':
                    category = 1

                # wrap image data and label into a serializable data structure
                datapoint = DataPoint(npimage, category)
                serialized_datapoint = datapoint.serialize()

                # a database key
                str_id = '{:08}'.format(j)

                # put the data point in the LMDB
                txn.put(str_id.encode('ascii'), serialized_datapoint)
                j += 1
I also made a small data structure to hold images and labels and to serialize them, which is used above:
import numpy as np


class DataPoint(object):
    def __init__(self, image=None, label=None, dtype=np.float64):
        self.image = image
        if self.image is not None:
            self.image = self.image.astype(dtype)
        self.label = label

    def serialize(self):
        image_string = self.image.tobytes()
        label_string = chr(self.label)
        datum_string = label_string + image_string
        return datum_string

    def deserialize(self, string):
        image_string = string[1:]
        label_string = string[:1]
        image = np.fromstring(image_string, dtype=np.float64)
        label = ord(label_string)
        return DataPoint(image, label)
Here's the error:
/usr/bin/python2.7 /home/hal9000/PycharmProjects/Caffe_Experiments_0.6/gather_images.py
Loading 0th image from disk - percentage complete: 0.0 %
Traceback (most recent call last):
File "/home/hal9000/PycharmProjects/Caffe_Experiments_0.6/gather_images.py", line 69, in <module>
g = LoadImages(path)
File "/home/hal9000/PycharmProjects/Caffe_Experiments_0.6/gather_images.py", line 62, in __init__
txn.put(str_id.encode('ascii'), serialized_datapoint)
lmdb.MapFullError: mdb_put: MDB_MAP_FULL: Environment mapsize limit reached
map_size is the maximum size of the whole database, including metadata, expressed in bytes - it appears you used the expected number of records (times 10) instead. That gives only 370 * 10 = 3700 bytes, so even the first txn.put (one label byte plus a full float64 image) overflows the map and raises MDB_MAP_FULL. You need to increase this number.
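One way to size it is to estimate the total volume of serialized data and add generous headroom; on 64-bit systems an oversized map_size costs little, because the database file only grows with the data actually written. In the sketch below, the 500 x 500 x 3 image shape and the 10x safety factor are assumptions - substitute your real image dimensions:

import lmdb
import numpy as np

num_images = 370

# each serialized record: 1 label byte + H * W * C float64 pixel bytes
# (500 x 500 x 3 is an assumed example shape - use your actual image size)
record_size = 1 + 500 * 500 * 3 * np.dtype(np.float64).itemsize

# give the map generous headroom; LMDB only uses pages as data is written
map_size = num_images * record_size * 10

env = lmdb.open('image_lmdb', map_size=map_size)

With a map_size on this order, the txn.put calls should no longer hit MDB_MAP_FULL.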