I find this question, but it uses command line, and I do not want to call a Python script in command line using subprocess and parse HTML files to get the font information.
I want to use PDFminer as a library, and I find this question, but they are just all about extracting plain texts, without other information such as font name, font size, and so on.
#!/usr/bin/env python
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import pdfminer
def createPDFDoc(fpath):
fp = open(fpath, 'rb')
parser = PDFParser(fp)
document = PDFDocument(parser, password='')
# Check if the document allows text extraction. If not, abort.
if not document.is_extractable:
raise "Not extractable"
else:
return document
def createDeviceInterpreter():
rsrcmgr = PDFResourceManager()
laparams = LAParams()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
return device, interpreter
def parse_obj(objs):
for obj in objs:
if isinstance(obj, pdfminer.layout.LTTextBox):
for o in obj._objs:
if isinstance(o,pdfminer.layout.LTTextLine):
text=o.get_text()
if text.strip():
for c in o._objs:
if isinstance(c, pdfminer.layout.LTChar):
print "fontname %s"%c.fontname
# if it's a container, recurse
elif isinstance(obj, pdfminer.layout.LTFigure):
parse_obj(obj._objs)
else:
pass
document=createPDFDoc("/tmp/simple.pdf")
device,interpreter=createDeviceInterpreter()
pages=PDFPage.create_pages(document)
interpreter.process_page(pages.next())
layout = device.get_result()
parse_obj(layout._objs)