I would like to improve the performance of a Python script and have been using cProfile
to generate a performance report:
python -m cProfile -o chrX.prof ./bgchr.py ...args...
I opened this chrX.prof
file with Python's pstats
and printed out the statistics:
Python 2.7 (r27:82500, Oct 5 2010, 00:24:22)
[GCC 4.1.2 20080704 (Red Hat 4.1.2-44)] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> import pstats
>>> p = pstats.Stats('chrX.prof')
>>> p.sort_stats('name')
>>> p.print_stats()
Sun Oct 10 00:37:30 2010 chrX.prof
8760583 function calls in 13.780 CPU seconds
Ordered by: function name
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.000 0.000 0.000 0.000 {_locale.setlocale}
1 1.128 1.128 1.128 1.128 {bz2.decompress}
1 0.002 0.002 13.780 13.780 {execfile}
1750678 0.300 0.000 0.300 0.000 {len}
48 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}
1 0.000 0.000 0.000 0.000 {method 'close' of 'file' objects}
1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
1750676 0.496 0.000 0.496 0.000 {method 'join' of 'str' objects}
1 0.007 0.007 0.007 0.007 {method 'read' of 'file' objects}
1 0.000 0.000 0.000 0.000 {method 'readlines' of 'file' objects}
1 0.034 0.034 0.034 0.034 {method 'rstrip' of 'str' objects}
23 0.000 0.000 0.000 0.000 {method 'seek' of 'file' objects}
1757785 1.230 0.000 1.230 0.000 {method 'split' of 'str' objects}
1 0.000 0.000 0.000 0.000 {method 'startswith' of 'str' objects}
1750676 0.872 0.000 0.872 0.000 {method 'write' of 'file' objects}
1 0.007 0.007 13.778 13.778 ./bgchr:3(<module>)
1 0.000 0.000 13.780 13.780 <string>:1(<module>)
1 0.001 0.001 0.001 0.001 {open}
1 0.000 0.000 0.000 0.000 {sys.exit}
1 0.000 0.000 0.000 0.000 ./bgchr:36(checkCommandLineInputs)
1 0.000 0.000 0.000 0.000 ./bgchr:27(checkInstallation)
1 1.131 1.131 13.701 13.701 ./bgchr:97(extractData)
1 0.003 0.003 0.007 0.007 ./bgchr:55(extractMetadata)
1 0.064 0.064 13.771 13.771 ./bgchr:5(main)
1750677 8.504 0.000 11.196 0.000 ./bgchr:122(parseJarchLine)
1 0.000 0.000 0.000 0.000 ./bgchr:72(parseMetadata)
1 0.000 0.000 0.000 0.000 /home/areynolds/proj/tools/lib/python2.7/locale.py:517(setlocale)
Question: What can I do about join
, split
and write
operations to reduce the apparent impact they have on the performance of this script?
If it is relevant, here is the full source code to the script in question:
#!/usr/bin/env python
import sys, os, time, bz2, locale
def main(*args):
# Constants
global metadataRequiredFileSize
metadataRequiredFileSize = 8192
requiredVersion = (2,5)
# Prep
global whichChromosome
whichChromosome = "all"
checkInstallation(requiredVersion)
checkCommandLineInputs()
extractMetadata()
parseMetadata()
if whichChromosome == "--list":
listMetadata()
sys.exit(0)
# Extract
extractData()
return 0
def checkInstallation(rv):
currentVersion = sys.version_info
if currentVersion[0] == rv[0] and currentVersion[1] >= rv[1]:
pass
else:
sys.stderr.write( "\n\t[%s] - Error: Your Python interpreter must be %d.%d or greater (within major version %d)\n" % (sys.argv[0], rv[0], rv[1], rv[0]) )
sys.exit(-1)
return
def checkCommandLineInputs():
cmdName = sys.argv[0]
argvLength = len(sys.argv[1:])
if (argvLength == 0) or (argvLength > 2):
sys.stderr.write( "\n\t[%s] - Usage: %s [<chromosome> | --list] <bjarch-file>\n\n" % (cmdName, cmdName) )
sys.exit(-1)
else:
global inFile
global whichChromosome
if argvLength == 1:
inFile = sys.argv[1]
elif argvLength == 2:
whichChromosome = sys.argv[1]
inFile = sys.argv[2]
if inFile == "-" or inFile == "--list":
sys.stderr.write( "\n\t[%s] - Usage: %s [<chromosome> | --list] <bjarch-file>\n\n" % (cmdName, cmdName) )
sys.exit(-1)
return
def extractMetadata():
global metadataList
global dataHandle
metadataList = []
dataHandle = open(inFile, 'rb')
try:
for data in dataHandle.readlines(metadataRequiredFileSize):
metadataLine = data
metadataLines = metadataLine.split('\n')
for line in metadataLines:
if line:
metadataList.append(line)
except IOError:
sys.stderr.write( "\n\t[%s] - Error: Could not extract metadata from %s\n\n" % (sys.argv[0], inFile) )
sys.exit(-1)
return
def parseMetadata():
global metadataList
global metadata
metadata = []
if not metadataList: # equivalent to "if len(metadataList) > 0"
sys.stderr.write( "\n\t[%s] - Error: No metadata in %s\n\n" % (sys.argv[0], inFile) )
sys.exit(-1)
for entryText in metadataList:
if entryText: # equivalent to "if len(entryText) > 0"
entry = entryText.split('\t')
filename = entry[0]
chromosome = entry[0].split('.')[0]
size = entry[1]
entryDict = { 'chromosome':chromosome, 'filename':filename, 'size':size }
metadata.append(entryDict)
return
def listMetadata():
for index in metadata:
chromosome = index['chromosome']
filename = index['filename']
size = long(index['size'])
sys.stdout.write( "%s\t%s\t%ld" % (chromosome, filename, size) )
return
def extractData():
global dataHandle
global pLength
global lastEnd
locale.setlocale(locale.LC_ALL, 'POSIX')
dataHandle.seek(metadataRequiredFileSize, 0) # move cursor past metadata
for index in metadata:
chromosome = index['chromosome']
size = long(index['size'])
pLength = 0L
lastEnd = ""
if whichChromosome == "all" or whichChromosome == index['chromosome']:
dataStream = dataHandle.read(size)
uncompressedData = bz2.decompress(dataStream)
lines = uncompressedData.rstrip().split('\n')
for line in lines:
parseJarchLine(chromosome, line)
if whichChromosome == chromosome:
break
else:
dataHandle.seek(size, 1) # move cursor past chromosome chunk
dataHandle.close()
return
def parseJarchLine(chromosome, line):
global pLength
global lastEnd
elements = line.split('\t')
if len(elements) > 1:
if lastEnd:
start = long(lastEnd) + long(elements[0])
lastEnd = long(start + pLength)
sys.stdout.write("%s\t%ld\t%ld\t%s\n" % (chromosome, start, lastEnd, '\t'.join(elements[1:])))
else:
lastEnd = long(elements[0]) + long(pLength)
sys.stdout.write("%s\t%ld\t%ld\t%s\n" % (chromosome, long(elements[0]), lastEnd, '\t'.join(elements[1:])))
else:
if elements[0].startswith('p'):
pLength = long(elements[0][1:])
else:
start = long(long(lastEnd) + long(elements[0]))
lastEnd = long(start + pLength)
sys.stdout.write("%s\t%ld\t%ld\n" % (chromosome, start, lastEnd))
return
if __name__ == '__main__':
sys.exit(main(*sys.argv))
EDIT
If I comment out the sys.stdout.write
statement in the first conditional of parseJarchLine()
, then my runtime goes from 10.2 sec to 4.8 sec:
# with first conditional's "sys.stdout.write" enabled
$ time ./bgchr chrX test.bjarch > /dev/null
real 0m10.186s
user 0m9.917s
sys 0m0.160s
# after first conditional's "sys.stdout.write" is commented out
$ time ./bgchr chrX test.bjarch > /dev/null
real 0m4.808s
user 0m4.561s
sys 0m0.156s
Is writing to stdout
really that expensive in Python?
ncalls
is relevant only to the extent that comparing the numbers against other counts such as number of chars/fields/lines in a file may highligh anomalies; tottime
and cumtime
is what really matters. cumtime
is the time spent in the function/method including the time spent in the functions/methods that it calls; tottime
is the time spent in the function/method excluding the time spent in the functions/methods that it calls.
I find it helpful to sort the stats on tottime
and again on cumtime
, not on name
.
bgchar
definitely refers to the execution of the script and is not irrelevant as it takes up 8.9 seconds out of 13.5; that 8.9 seconds does NOT include time in the functions/methods that it calls! Read carefully what @Lie Ryan says about modularising your script into functions, and implement his advice. Likewise what @jonesy says.
string
is mentioned because you import string
and use it in only one place: string.find(elements[0], 'p')
. On another line in the output you'll notice that string.find was called only once, so it's not a performance problem in this run of this script. HOWEVER: You use str
methods everywhere else. string
functions are deprecated nowadays and are implemented by calling the corresponding str
method. You would be better writing elements[0].find('p') == 0
for an exact but faster equivalent, and might like to use elements[0].startswith('p')
which would save readers wondering whether that == 0
should actually be == -1
.
The four methods mentioned by @Bernd Petersohn take up only 3.7 seconds out of a total execution time of 13.541 seconds. Before worrying too much about those, modularise your script into functions, run cProfile again, and sort the stats by tottime
.
Update after question revised with changed script:
"""Question: What can I do about join, split and write operations to reduce the apparent impact they have on the performance of this script?""
Huh? Those 3 together take 2.6 seconds out of the total of 13.8. Your parseJarchLine function is taking 8.5 seconds (which doesn't include time taken by functions/methods that it calls. assert(8.5 > 2.6)
Bernd has already pointed you at what you might consider doing with those. You are needlessly splitting the line completely only to join it up again when writing it out. You need to inspect only the first element. Instead of elements = line.split('\t')
do elements = line.split('\t', 1)
and replace '\t'.join(elements[1:])
by elements[1]
.
Now let's dive into the body of parseJarchLine. The number of uses in the source and manner of the uses of the long
built-in function are astonishing. Also astonishing is the fact that long
is not mentioned in the cProfile output.
Why do you need long
at all? Files over 2 Gb? OK, then you need to consider that since Python 2.2, int
overflow causes promotion to long
instead of raising an exception. You can take advantage of faster execution of int
arithmetic. You also need to consider that doing long(x)
when x
is already demonstrably a long
is a waste of resources.
Here is the parseJarchLine function with removing-waste changes marked [1] and changing-to-int changes marked [2]. Good idea: make changes in small steps, re-test, re-profile.
def parseJarchLine(chromosome, line):
global pLength
global lastEnd
elements = line.split('\t')
if len(elements) > 1:
if lastEnd != "":
start = long(lastEnd) + long(elements[0])
# [1] start = lastEnd + long(elements[0])
# [2] start = lastEnd + int(elements[0])
lastEnd = long(start + pLength)
# [1] lastEnd = start + pLength
sys.stdout.write("%s\t%ld\t%ld\t%s\n" % (chromosome, start, lastEnd, '\t'.join(elements[1:])))
else:
lastEnd = long(elements[0]) + long(pLength)
# [1] lastEnd = long(elements[0]) + pLength
# [2] lastEnd = int(elements[0]) + pLength
sys.stdout.write("%s\t%ld\t%ld\t%s\n" % (chromosome, long(elements[0]), lastEnd, '\t'.join(elements[1:])))
else:
if elements[0].startswith('p'):
pLength = long(elements[0][1:])
# [2] pLength = int(elements[0][1:])
else:
start = long(long(lastEnd) + long(elements[0]))
# [1] start = lastEnd + long(elements[0])
# [2] start = lastEnd + int(elements[0])
lastEnd = long(start + pLength)
# [1] lastEnd = start + pLength
sys.stdout.write("%s\t%ld\t%ld\n" % (chromosome, start, lastEnd))
return
Update after question about sys.stdout.write
If the statement that you commented out was anything like the original one:
sys.stdout.write("%s\t%ld\t%ld\t%s\n" % (chromosome, start, lastEnd, '\t'.join(elements[1:])))
Then your question is ... interesting. Try this:
payload = "%s\t%ld\t%ld\t%s\n" % (chromosome, start, lastEnd, '\t'.join(elements[1:]))
sys.stdout.write(payload)
Now comment out the sys.stdout.write
statement ...
By the way, someone mentioned in a comment about breaking this into more than one write ... have you considered this? How many bytes on average in elements[1:] ? In chromosome?
=== change of topic: It worries me that you initialise lastEnd
to ""
rather than to zero, and that nobody has commented on it. Any way, you should fix this, which allows a rather drastic simplification plus adding in others' suggestions:
def parseJarchLine(chromosome, line):
global pLength
global lastEnd
elements = line.split('\t', 1)
if elements[0][0] == 'p':
pLength = int(elements[0][1:])
return
start = lastEnd + int(elements[0])
lastEnd = start + pLength
sys.stdout.write("%s\t%ld\t%ld" % (chromosome, start, lastEnd))
if elements[1:]:
sys.stdout.write(elements[1])
sys.stdout.write(\n)
Now I'm similarly worried about the two global variables lastEnd
and pLength
-- the parseJarchLine function is now so small that it can be folded back into the body of its sole caller, extractData
, which saves two global variables, and a gazillion function calls. You could also save a gazillion lookups of sys.stdout.write
by putting write = sys.stdout.write
once up the front of extractData
and using that instead.
BTW, the script tests for Python 2.5 or better; have you tried profiling on 2.5 and 2.6?