I'm using Python's multiprocessing.Manager to share access to a dataset that one process will generate and others will view. However, I'm running into the problem that the dict proxy returned by manager.dict() doesn't support iteritems().
I could iterate over items(), but that means constructing a new tuple of all the items in the dict — and there are a large number of them. Is there a way to do it without constructing an intermediate list/tuple, thus using only a constant amount of extra memory?
Note: It's OK if the solution requires that the generating process pauses for the iteration.
You could iterate over keys() to reduce your memory footprint; you'd have to guard against keys being deleted while you iterate.
Otherwise, here's an example with two different ways to iterate through the items in a dict. The iteritems() method in this example only works from the process that creates the manager object and from the child process that the manager object spawns. That's because the manager object is needed to create new proxies, and other processes don't have access to it. The iteritems2() method works from other processes as well, since it doesn't rely on creating a new proxy in those processes.
import multiprocessing as mp
import multiprocessing.managers
class mydict(dict):
    # Server-side dict subclass that lives inside the manager's server
    # process.  Besides normal dict behaviour it keeps a registry of live
    # iterators so remote processes can pull items one at a time instead of
    # receiving the whole item list in a single message.

    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        # Registry mapping id(iterator) -> iterator, used by the
        # _iteritems_start/_iteritems_next protocol below.
        self.iters = {}

    def iteritems(self):
        # Runs in the manager's server process.  The returned iterator is
        # wrapped in an "Iterator" proxy by the manager because the proxy
        # class declares it in _method_to_typeid_.
        print "iteritems", mp.current_process()
        return dict.iteritems(self)

    def _iteritems_start(self):
        # Begin a server-side iteration and return an opaque token (the
        # iterator's id) the caller passes back to _iteritems_next.
        print "_iteritems_start", mp.current_process()
        i = dict.iteritems(self)
        self.iters[id(i)] = i
        return id(i)

    def _iteritems_next(self, iter_id):
        # Return the next item for the given token, or None when the
        # iteration is exhausted.  None is safe as the end sentinel because
        # every real item is a (key, value) tuple, never None.
        try:
            return self.iters[iter_id].next()
        except StopIteration:
            # Drop the finished iterator so the registry doesn't leak.
            # NOTE(review): an iterator that is abandoned before exhaustion
            # stays in self.iters for the dict's lifetime.
            del self.iters[iter_id]
            return None
class mydict_proxy(mp.managers.DictProxy):
    # Client-side proxy for mydict.  Adds two ways of iterating items on top
    # of the stock DictProxy behaviour.

    def iteritems(self):
        # Forwards to the server.  Because of _method_to_typeid_ below, the
        # manager wraps the server-side iterator in an IteratorProxy instead
        # of copying it back by value.  Creating that new proxy requires
        # access to the manager object, so this only works from the process
        # that created the manager (and the manager's own child process).
        print "iteritems proxy", mp.current_process()
        return self._callmethod("iteritems")

    def iteritems2(self):
        # Manager-free iteration: obtains an opaque iterator token from the
        # server and pulls items one call at a time, so it works from any
        # process holding this proxy.
        print "iteritems2 proxy", mp.current_process()
        iter_id = self._callmethod("_iteritems_start")
        def generator():
            while True:
                a = self._callmethod("_iteritems_next",
                                     (iter_id,))
                # None is the server's end-of-iteration sentinel.
                if a == None:
                    return
                yield a
        return generator()

    # Tell the manager that iteritems() results must be returned as an
    # "Iterator" proxy rather than pickled back by value.
    _method_to_typeid_ = { "iteritems": "Iterator" }
    # Expose the extra methods (in addition to DictProxy's defaults) so the
    # server allows _callmethod() to invoke them.
    _exposed_ = mp.managers.DictProxy._exposed_
    _exposed_ += ("iteritems", "_iteritems_start", "_iteritems_next")
class mymanager(mp.managers.BaseManager):
    """Manager subclass; the registrations below attach the custom dict."""
    pass


# Serve mydict instances through the custom proxy defined above.
mymanager.register("mydict", mydict, mydict_proxy)
# Proxy type used for iterators returned by iteritems(); create_method=False
# because callers never construct an "Iterator" directly on the manager.
mymanager.register(
    "Iterator",
    proxytype=mp.managers.IteratorProxy,
    create_method=False,
)
def other(d):
    # Child-process workload: mutates the shared dict while iterating it
    # through both flavours of remote iteration.
    # iteritems2() works from any process (no new proxy is created here).
    for key, value in d.iteritems2():
        d[key] = value.lower()
    # iteritems() relies on the manager building an Iterator proxy for us.
    for key, value in d.iteritems():
        d[key] = ord(value)
def main():
manager = mymanager()
manager.start()
d = manager.mydict(list(enumerate("ABCDEFGHIJKLMNOP")))
for (k, v) in d.iteritems():
print k, v
proc = mp.Process(target = other, args = (d,))
proc.start()
proc.join()
for (k, v) in d.iteritems():
print k, v
if __name__ == "__main__":
main()
Note that while this code may be more memory-efficient, it's likely to be considerably slower, since every item fetched by iteritems2() costs a full round trip to the manager process.