I would like to improve the performance of a Python script and have been using cProfile
to generate a performance report:
python -m cProfile -o chrX.prof ./bgchr.py ...args...
I opened this chrX.prof
file with Python's pstats
and printed out the statistics:
Python 2.7 (r27:82500, Oct 5 2010, 00:24:22)
[GCC 4.1.2 20080704 (Red Hat 4.1.2-44)] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> import pstats
>>> p = pstats.Stats('chrX.prof')
>>> p.sort_stats('name')
>>> p.print_stats()
Sun Oct 10 00:37:30 2010 chrX.prof
8760583 function calls in 13.780 CPU seconds
Ordered by: function name
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.000 0.000 0.000 0.000 {_locale.setlocale}
1 1.128 1.128 1.128 1.128 {bz2.decompress}
1 0.002 0.002 13.780 13.780 {execfile}
1750678 0.300 0.000 0.300 0.000 {len}
48 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}
1 0.000 0.000 0.000 0.000 {method 'close' of 'file' objects}
1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
1750676 0.496 0.000 0.496 0.000 {method 'join' of 'str' objects}
1 0.007 0.007 0.007 0.007 {method 'read' of 'file' objects}
1 0.000 0.000 0.000 0.000 {method 'readlines' of 'file' objects}
1 0.034 0.034 0.034 0.034 {method 'rstrip' of 'str' objects}
23 0.000 0.000 0.000 0.000 {method 'seek' of 'file' objects}
1757785 1.230 0.000 1.230 0.000 {method 'split' of 'str' objects}
1 0.000 0.000 0.000 0.000 {method 'startswith' of 'str' objects}
1750676 0.872 0.000 0.872 0.000 {method 'write' of 'file' objects}
1 0.007 0.007 13.778 13.778 ./bgchr:3(<module>)
1 0.000 0.000 13.780 13.780 <string>:1(<module>)
1 0.001 0.001 0.001 0.001 {open}
1 0.000 0.000 0.000 0.000 {sys.exit}
1 0.000 0.000 0.000 0.000 ./bgchr:36(checkCommandLineInputs)
1 0.000 0.000 0.000 0.000 ./bgchr:27(checkInstallation)
1 1.131 1.131 13.701 13.701 ./bgchr:97(extractData)
1 0.003 0.003 0.007 0.007 ./bgchr:55(extractMetadata)
1 0.064 0.064 13.771 13.771 ./bgchr:5(main)
1750677 8.504 0.000 11.196 0.000 ./bgchr:122(parseJarchLine)
1 0.000 0.000 0.000 0.000 ./bgchr:72(parseMetadata)
1 0.000 0.000 0.000 0.000 /home/areynolds/proj/tools/lib/python2.7/locale.py:517(setlocale)
Question: What can I do about join
, split
and write
operations to reduce the apparent impact they have on the performance of this script?
If it is relevant, here is the full source code to the script in question:
#!/usr/bin/env python
import sys, os, time, bz2, locale
def main(*args):
# Constants
global metadataRequiredFileSize
metadataRequiredFileSize = 8192
requiredVersion = (2,5)
# Prep
global whichChromosome
whichChromosome = "all"
checkInstallation(requiredVersion)
checkCommandLineInputs()
extractMetadata()
parseMetadata()
if whichChromosome == "--list":
listMetadata()
sys.exit(0)
# Extract
extractData()
return 0
def checkInstallation(rv):
currentVersion = sys.version_info
if currentVersion[0] == rv[0] and currentVersion[1] >= rv[1]:
pass
else:
sys.stderr.write( "\n\t[%s] - Error: Your Python interpreter must be %d.%d or greater (within major version %d)\n" % (sys.argv[0], rv[0], rv[1], rv[0]) )
sys.exit(-1)
return
def checkCommandLineInputs():
cmdName = sys.argv[0]
argvLength = len(sys.argv[1:])
if (argvLength == 0) or (argvLength > 2):
sys.stderr.write( "\n\t[%s] - Usage: %s [<chromosome> | --list] <bjarch-file>\n\n" % (cmdName, cmdName) )
sys.exit(-1)
else:
global inFile
global whichChromosome
if argvLength == 1:
inFile = sys.argv[1]
elif argvLength == 2:
whichChromosome = sys.argv[1]
inFile = sys.argv[2]
if inFile == "-" or inFile == "--list":
sys.stderr.write( "\n\t[%s] - Usage: %s [<chromosome> | --list] <bjarch-file>\n\n" % (cmdName, cmdName) )
sys.exit(-1)
return
def extractMetadata():
global metadataList
global dataHandle
metadataList = []
dataHandle = open(inFile, 'rb')
try:
for data in dataHandle.readlines(metadataRequiredFileSize):
metadataLine = data
metadataLines = metadataLine.split('\n')
for line in metadataLines:
if line:
metadataList.append(line)
except IOError:
sys.stderr.write( "\n\t[%s] - Error: Could not extract metadata from %s\n\n" % (sys.argv[0], inFile) )
sys.exit(-1)
return
def parseMetadata():
global metadataList
global metadata
metadata = []
if not metadataList: # equivalent to "if len(metadataList) > 0"
sys.stderr.write( "\n\t[%s] - Error: No metadata in %s\n\n" % (sys.argv[0], inFile) )
sys.exit(-1)
for entryText in metadataList:
if entryText: # equivalent to "if len(entryText) > 0"
entry = entryText.split('\t')
filename = entry[0]
chromosome = entry[0].split('.')[0]
size = entry[1]
entryDict = { 'chromosome':chromosome, 'filename':filename, 'size':size }
metadata.append(entryDict)
return
def listMetadata():
for index in metadata:
chromosome = index['chromosome']
filename = index['filename']
size = long(index['size'])
sys.stdout.write( "%s\t%s\t%ld" % (chromosome, filename, size) )
return
def extractData():
global dataHandle
global pLength
global lastEnd
locale.setlocale(locale.LC_ALL, 'POSIX')
dataHandle.seek(metadataRequiredFileSize, 0) # move cursor past metadata
for index in metadata:
chromosome = index['chromosome']
size = long(index['size'])
pLength = 0L
lastEnd = ""
if whichChromosome == "all" or whichChromosome == index['chromosome']:
dataStream = dataHandle.read(size)
uncompressedData = bz2.decompress(dataStream)
lines = uncompressedData.rstrip().split('\n')
for line in lines:
parseJarchLine(chromosome, line)
if whichChromosome == chromosome:
break
else:
dataHandle.seek(size, 1) # move cursor past chromosome chunk
dataHandle.close()
return
def parseJarchLine(chromosome, line):
global pLength
global lastEnd
elements = line.split('\t')
if len(elements) > 1:
if lastEnd:
start = long(lastEnd) + long(elements[0])
lastEnd = long(start + pLength)
sys.stdout.write("%s\t%ld\t%ld\t%s\n" % (chromosome, start, lastEnd, '\t'.join(elements[1:])))
else:
lastEnd = long(elements[0]) + long(pLength)
sys.stdout.write("%s\t%ld\t%ld\t%s\n" % (chromosome, long(elements[0]), lastEnd, '\t'.join(elements[1:])))
else:
if elements[0].startswith('p'):
pLength = long(elements[0][1:])
else:
start = long(long(lastEnd) + long(elements[0]))
lastEnd = long(start + pLength)
sys.stdout.write("%s\t%ld\t%ld\n" % (chromosome, start, lastEnd))
return
if __name__ == '__main__':
sys.exit(main(*sys.argv))
EDIT
If I comment out the sys.stdout.write
statement in the first conditional of parseJarchLine()
, then my runtime goes from 10.2 sec to 4.8 sec:
# with first conditional's "sys.stdout.write" enabled
$ time ./bgchr chrX test.bjarch > /dev/null
real 0m10.186s
user 0m9.917s
sys 0m0.160s
# after first conditional's "sys.stdout.write" is commented out
$ time ./bgchr chrX test.bjarch > /dev/null
real 0m4.808s
user 0m4.561s
sys 0m0.156s
Is writing to stdout
really that expensive in Python?