I want to be able to convert PDFs to CSV files and have found several useful scripts but, being new to Python, I have a question:
Where do you specify the filepath of the PDF and the CSV you want to print to?
I'm using Python 2.7.11 and PDFMiner 20140328.
import sys
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO
def pdfparser(data):
fp = file(data, 'rb')
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.get_pages(fp):
interpreter.process_page(page)
data = retstr.getvalue()
print data
if __name__ == '__main__':
pdfparser(sys.argv[1])
Here is some modified code from this SO answer written by tgray:
def pdf_to_csv(filename, separator, threshold):
from cStringIO import StringIO
from pdfminer.converter import LTChar, TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
class CsvConverter(TextConverter):
def __init__(self, *args, **kwargs):
TextConverter.__init__(self, *args, **kwargs)
self.separator = separator
self.threshold = threshold
def end_page(self, i):
from collections import defaultdict
lines = defaultdict(lambda: {})
for child in self.cur_item._objs: # <-- changed
if isinstance(child, LTChar):
(_, _, x, y) = child.bbox
line = lines[int(-y)]
line[x] = child._text.encode(self.codec) # <-- changed
for y in sorted(lines.keys()):
line = lines[y]
self.line_creator(line)
self.outfp.write(self.line_creator(line))
self.outfp.write("\n")
def line_creator(self, line):
keys = sorted(line.keys())
# calculate the average distange between each character on this row
average_distance = sum([keys[i] - keys[i - 1] for i in range(1, len(keys))]) / len(keys)
# append the first character to the result
result = [line[keys[0]]]
for i in range(1, len(keys)):
# if the distance between this character and the last character is greater than the average*threshold
if (keys[i] - keys[i - 1]) > average_distance * self.threshold:
# append the separator into that position
result.append(self.separator)
# append the character
result.append(line[keys[i]])
printable_line = ''.join(result)
return printable_line
# ... the following part of the code is a remix of the
# convert() function in the pdfminer/tools/pdf2text module
rsrc = PDFResourceManager()
outfp = StringIO()
device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
# becuase my test documents are utf-8 (note: utf-8 is the default codec)
fp = open(filename, 'rb')
interpreter = PDFPageInterpreter(rsrc, device)
for i, page in enumerate(PDFPage.get_pages(fp)):
outfp.write("START PAGE %d\n" % i)
if page is not None:
print 'none'
interpreter.process_page(page)
outfp.write("END PAGE %d\n" % i)
device.close()
fp.close()
return outfp.getvalue()
if __name__ == '__main__':
# the separator to use with the CSV
separator = ';'
# the distance multiplier after which a character is considered part of a new word/column/block. Usually 1.5 works quite well
threshold = 1.5
print pdf_to_csv('myLovelyFile.pdf', separator, threshold)
The main difference between the answer in the link and this one is the line_creator method, which tries to extract some structure out of the PDF.
Should work with PDFminer 20140328.