I'm writing a web app that extracts a line from the top of each page in a PDF. The PDFs come from different versions of a product and may have passed through any of a number of PDF printers, each of which also comes in different versions and with different settings.
So far using PDFSharp and iTextSharp I have managed to get it to work for all versions of PDFs. My hang-up is with documents that have CID fonts (Identity-H).
I have written a partial parser to find the font table reference and the text blocks, but converting these to readable text is beating me.
Does anyone have either: (a) a parser (like this one: https://stackoverflow.com/a/1732265/5169050) that copes with CID fonts; or (b) some example code showing how to parse a page's resource dictionary to find the page's fonts and read each font's ToUnicode stream, to help finish off this example (https://stackoverflow.com/a/4048328/5169050)?
We have to use iTextSharp 4.1 to retain the free-to-use license.
Here's my partial parser.
/// <summary>
/// Partial parser for an uncompressed content stream whose text is written as hex
/// strings (e.g. <c>&lt;00480065&gt;</c>), as produced for CID / Identity-H fonts.
/// </summary>
/// <remarks>
/// The 4-digit glyph codes are NOT yet translated through the font's ToUnicode CMap
/// (see the TODO below): as a placeholder the last hex digit of each code is emitted.
/// </remarks>
/// <param name="input">The uncompressed content stream bytes.</param>
/// <returns>
/// For every '&gt;' in the stream, <see cref="Environment.NewLine"/> followed by the
/// placeholder characters collected since the matching '&lt;'. Returns "" for null or
/// empty input.
/// </returns>
public string ExtractTextFromCIDPDFBytes(byte[] input)
{
    if (input == null || input.Length == 0) return "";

    // Holds the final result to be returned
    System.Text.StringBuilder resultString = new System.Text.StringBuilder();
    // Holds each line of text before it is written to resultString
    System.Text.StringBuilder phrase = new System.Text.StringBuilder();
    // Holds the 4-character hex code as it is built
    System.Text.StringBuilder hexCode = new System.Text.StringBuilder(4);
    // Holds the last font reference, i.e. which CMap the TODO below will need.
    // It is collected but not consumed yet.
    System.Text.StringBuilder currentFontRef = new System.Text.StringBuilder();
    // Are we inside a <...> hex string?
    bool blnInText = false;
    // Are we inside a font reference (between a Tf and the next Td)?
    bool blnInFontRef = false;

    for (int i = 0; i < input.Length; i++)
    {
        char c = (char)input[i];
        switch (c)
        {
            case '<':
                blnInText = true;
                // Never let a partial code from a previous string leak into this one
                hexCode.Length = 0;
                break;

            case '>':
                resultString.Append(Environment.NewLine).Append(phrase.ToString());
                phrase.Length = 0;
                blnInText = false;
                break;

            case 'T':
                // Only peek at the operator letter when there is a next byte; a stream
                // ending in 'T' used to throw and discard everything parsed so far.
                if (i + 1 < input.Length)
                {
                    char op = (char)input[i + 1];
                    if (op == 'f' || op == 'F')
                    {
                        // Tf represents the start of a font table reference
                        blnInFontRef = true;
                        currentFontRef.Length = 0;
                    }
                    else if (op == 'd' || op == 'D')
                    {
                        // Td represents the end of a font table reference or
                        // the start of a text block
                        blnInFontRef = false;
                    }
                }
                break;

            default:
                if (blnInText)
                {
                    // White space inside a PDF hex string is insignificant
                    if (char.IsWhiteSpace(c)) break;

                    // We are looking for 4-character blocks of hex characters. Each
                    // block is the index of a glyph in the CMap table, which will
                    // give us the character.
                    hexCode.Append(c);
                    if (hexCode.Length == 4)
                    {
                        // TODO - translate code to character
                        char translatedHexCode = c;
                        phrase.Append(translatedHexCode);
                        // Blank it out ready for the next 4
                        hexCode.Length = 0;
                    }
                }
                else if (blnInFontRef)
                {
                    currentFontRef.Append(c);
                }
                break;
        }
    }

    return resultString.ToString();
}
It took a while, but I finally have some code that reads plain text from an Identity-H encoded PDF. I'm posting it here to help others, and I know there are ways to improve on it. For instance, I haven't touched character mappings (beginbfchar), and my ranges are not actually ranges: each bfrange line is treated as a single code-to-character mapping. I've spent over a week on this already and can't justify more time unless we hit files that work differently. Sorry.
Usage:
// Open the document through the helper so files PDFsharp cannot read directly are converted first.
PdfDocument inputDocument = PDFHelpers.Open(physcialFilePath, PdfDocumentOpenMode.Import);
foreach (PdfPage page in inputDocument.Pages)
{
    for (Int32 index = 0; index < page.Contents.Elements.Count; index++)
    {
        PdfDictionary.PdfStream stream = page.Contents.Elements.GetDictionary(index).Stream;
        // First try the plain "(text) Tj" parser and strip the spaces it inserts.
        String outputText = new PDFParser().ExtractTextFromPDFBytes(stream.Value).Replace(" ", String.Empty);
        // ExtractTextFromPDFBytes emits "\n\r" for positioning operators, so a result made up
        // only of those means no literal text was found in this stream.
        if (outputText == "" || outputText.Replace("\n\r", "") == "")
        {
            // Identity-H encoded file
            string[] hierarchy = new string[] { "/Resources", "/Font", "/F*" };
            List<PdfItem> fonts = PDFHelpers.FindObjects(hierarchy, page, true);
            outputText = PDFHelpers.FromUnicode(stream, fonts);
        }
    }
}
And here is the actual helper class, which I'll post in its entirety because all of its members are used in the example, and because I found so few complete examples myself when I was trying to solve this issue. The helper uses both PDFSharp and iTextSharp so that it can open PDFs both pre- and post-1.5. It also relies on ExtractTextFromPDFBytes to read a standard PDF, on my FindObjects (which searches the document tree and returns objects), and on FromUnicode, which takes the encoded text and a collection of fonts and translates the text.
using PdfSharp.Pdf;
using PdfSharp.Pdf.Content;
using PdfSharp.Pdf.Content.Objects;
using System;
using System.Collections.Generic;
using System.IO;
namespace PdfSharp.Pdf.IO
{
/// <summary>
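/// Helper methods for working with PDFs through PDFsharp: Open (used instead of PdfReader.Open; falls back to iTextSharp 4.1.6 to rewrite a PDF as a 1.4 compatible file when PDFsharp cannot read it), FindObjects (searches the document tree) and FromUnicode (decodes CID encoded text).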
/// </summary>
static public class PDFHelpers
{
/// <summary>
/// Opens the PDF stored at <paramref name="PdfPath"/> without a password.
/// Call this instead of PdfReader.Open; it forwards to the password-taking overload.
/// </summary>
/// <param name="PdfPath">Path of the PDF file to open.</param>
/// <param name="openmode">The PDFsharp open mode for the returned document.</param>
/// <returns>The opened document.</returns>
public static PdfDocument Open(string PdfPath, PdfDocumentOpenMode openmode)
{
    string noPassword = null;
    PdfDocument document = Open(PdfPath, noPassword, openmode);
    return document;
}
/// <summary>
/// Opens the PDF stored at <paramref name="PdfPath"/>, called instead of PdfReader.Open.
/// The file is read fully into memory and opened through the byte-array overload. When the
/// resulting document has an empty <c>FullPath</c>, the in-memory document is saved back
/// over the original file and then reopened.
/// </summary>
/// <param name="PdfPath">Path of the PDF file. The file may be overwritten (see above).</param>
/// <param name="password">Password used to open the document, or null for none.</param>
/// <param name="openmode">The PDFsharp open mode for the returned document.</param>
/// <returns>The opened document.</returns>
/// <exception cref="Exception">The memory pre-check decided the file is too large.</exception>
/// <exception cref="EndOfStreamException">The file ended before all of its bytes were read.</exception>
public static PdfDocument Open(string PdfPath, string password, PdfDocumentOpenMode openmode)
{
    byte[] fileArray;

    // The using block is the only owner of the file handle. It is released before the
    // file is (possibly) overwritten below.
    using (FileStream fileStream = new FileStream(PdfPath, FileMode.Open, FileAccess.Read))
    {
        // checked: a file larger than int.MaxValue fails loudly instead of wrapping negative
        int len = checked((int)fileStream.Length);

        // TODO: Allocating this byte array is what causes the out of memory exception, which
        // is why we have the 70mb limit. Solve this and we can increase the file size limit.
        // NOTE(review): PrivateMemorySize64 is the memory already committed to this process,
        // not the memory still available, so this is a heuristic at best. Kept unchanged.
        long availableMemory;
        using (System.Diagnostics.Process proc = System.Diagnostics.Process.GetCurrentProcess())
        {
            availableMemory = proc.PrivateMemorySize64 / 1024 / 1024; // Mb
        }
        if (availableMemory < (fileStream.Length / 1024 / 1024))
        {
            throw new Exception("The available memory " + availableMemory + "Mb is not enough to open, split and save a file of " + fileStream.Length / 1024 / 1024);
        }

        fileArray = new byte[len];
        // Stream.Read may return fewer bytes than requested, so loop until the buffer is full
        int offset = 0;
        while (offset < len)
        {
            int read = fileStream.Read(fileArray, offset, len - offset);
            if (read == 0)
            {
                throw new EndOfStreamException("Unexpected end of file while reading " + PdfPath);
            }
            offset += read;
        }
    }

    // Pass the password on; it used to be dropped, so protected files could never be opened
    PdfDocument result = Open(fileArray, password, openmode);
    if (result.FullPath == "")
    {
        // The document only exists in memory.
        // Save over the original file so other references to the file get the compatible version
        // TODO: It would be good if we could do this conversion without opening every document another 2 times
        PdfDocument tempResult = Open(fileArray, password, PdfDocumentOpenMode.Modify);
        try
        {
            tempResult.Save(PdfPath);
            tempResult.Close();
        }
        finally
        {
            tempResult.Dispose();
        }
        result = Open(fileArray, password, openmode);
    }
    return result;
}
/// <summary>
/// Opens a PDF held in a byte array without a password, called instead of PdfReader.Open.
/// The bytes are wrapped in a MemoryStream and passed to the stream overload.
/// </summary>
/// <param name="fileArray">The complete PDF file content.</param>
/// <param name="openmode">The PDFsharp open mode for the returned document.</param>
/// <returns>The opened document.</returns>
public static PdfDocument Open(byte[] fileArray, PdfDocumentOpenMode openmode)
{
    MemoryStream pdfBytes = new MemoryStream(fileArray);
    return Open(pdfBytes, null, openmode);
}
/// <summary>
/// Opens a PDF held in a byte array using the given password, called instead of
/// PdfReader.Open. The bytes are wrapped in a MemoryStream and passed to the stream overload.
/// </summary>
/// <param name="fileArray">The complete PDF file content.</param>
/// <param name="password">Password used to open the document, or null for none.</param>
/// <param name="openmode">The PDFsharp open mode for the returned document.</param>
/// <returns>The opened document.</returns>
public static PdfDocument Open(byte[] fileArray, string password, PdfDocumentOpenMode openmode)
{
    MemoryStream pdfBytes = new MemoryStream(fileArray);
    PdfDocument document = Open(pdfBytes, password, openmode);
    return document;
}
/// <summary>
/// Opens a PDF from a memory stream without a password, called instead of PdfReader.Open.
/// Forwards to the password-taking stream overload.
/// </summary>
/// <param name="sourceStream">Stream containing the complete PDF file.</param>
/// <param name="openmode">The PDFsharp open mode for the returned document.</param>
/// <returns>The opened document.</returns>
public static PdfDocument Open(MemoryStream sourceStream, PdfDocumentOpenMode openmode)
{
    string noPassword = null;
    return Open(sourceStream, noPassword, openmode);
}
/// <summary>
/// Opens a PDF from a memory stream, called instead of PdfReader.Open. PDFsharp is tried
/// first; if it throws a PdfReaderException, iTextSharp 4.1.6 rewrites the file as a
/// PDF 1.4 document (with form fields flattened) in memory and PDFsharp opens that copy.
/// </summary>
/// <param name="sourceStream">Stream containing the complete PDF file. It is rewound before use.</param>
/// <param name="password">Password used to open the document, or null for none.</param>
/// <param name="openmode">The PDFsharp open mode for the returned document.</param>
/// <returns>The opened document.</returns>
/// <exception cref="ArgumentNullException"><paramref name="sourceStream"/> is null.</exception>
public static PdfDocument Open(MemoryStream sourceStream, string password, PdfDocumentOpenMode openmode)
{
    if (sourceStream == null)
    {
        throw new ArgumentNullException("sourceStream");
    }

    sourceStream.Position = 0;
    try
    {
        // The original also parsed the file with iTextSharp here and listed its fonts, but
        // discarded both results and never closed the reader, so that work is gone.
        return (password == null)
            ? PdfReader.Open(sourceStream, openmode)
            : PdfReader.Open(sourceStream, password, openmode);
    }
    catch (PdfSharp.Pdf.IO.PdfReaderException)
    {
        // Workaround if PDFsharp doesn't support this PDF
        sourceStream.Position = 0;
        MemoryStream outputStream = new MemoryStream();
        iTextSharp.text.pdf.PdfReader reader = (password == null) ?
            new iTextSharp.text.pdf.PdfReader(sourceStream) :
            new iTextSharp.text.pdf.PdfReader(sourceStream, System.Text.ASCIIEncoding.ASCII.GetBytes(password));
        try
        {
            iTextSharp.text.pdf.PdfStamper pdfStamper = new iTextSharp.text.pdf.PdfStamper(reader, outputStream);
            pdfStamper.FormFlattening = true;
            pdfStamper.Writer.SetPdfVersion(iTextSharp.text.pdf.PdfWriter.PDF_VERSION_1_4);
            // Keep outputStream open after the stamper is closed; PDFsharp reads from it next
            pdfStamper.Writer.CloseStream = false;
            pdfStamper.Close();
        }
        finally
        {
            reader.Close();
        }

        outputStream.Position = 0;
        return PdfReader.Open(outputStream, openmode);
    }
}
/// <summary>
/// Searches the PDF object tree below <paramref name="startingObject"/> for the named
/// objects and returns every match. The search itself is done by the recursive private
/// overload, starting at level 0.
/// </summary>
/// <param name="objectHierarchy">Names of the objects to look for. Wildcards can be used in
/// element names, e.g., /F*. When <paramref name="followHierarchy"/> is true the order is a
/// top-down path, one name per level. If a single name is passed it should sit in the level
/// directly below <paramref name="startingObject"/>, or <paramref name="followHierarchy"/>
/// should be false.</param>
/// <param name="startingObject">The PDF object to search. This will likely be a document or
/// a page, but could be any lower-level item.</param>
/// <param name="followHierarchy">True to follow only the branch described by
/// <paramref name="objectHierarchy"/>; false to match any of the names regardless of
/// position.</param>
/// <returns>The matching objects; empty when nothing matched.</returns>
public static List<PdfItem> FindObjects(string[] objectHierarchy, PdfItem startingObject, bool followHierarchy)
{
    const int topLevel = 0;
    List<PdfItem> matches = new List<PdfItem>();
    FindObjects(objectHierarchy, startingObject, followHierarchy, ref matches, topLevel);
    return matches;
}
/// <summary>
/// Recursive worker behind the public FindObjects: inspects every key of
/// <paramref name="startingObject"/> and descends into each matching entry.
/// </summary>
/// <remarks>
/// Only entries that match are descended into, so with <paramref name="followHierarchy"/>
/// set to false a target is only found when every ancestor key also matches one of the names.
/// Items that are not dictionaries (or are null) are treated as leaves.
/// </remarks>
/// <param name="objectHierarchy">Names (optionally containing one '*' wildcard) to match.</param>
/// <param name="startingObject">The item whose keys are inspected at this level.</param>
/// <param name="followHierarchy">True to match only <c>objectHierarchy[Level]</c>; false to
/// match any name in <paramref name="objectHierarchy"/>.</param>
/// <param name="results">Receives the matching items.</param>
/// <param name="Level">Depth of this call; 0 for the starting object.</param>
private static void FindObjects(string[] objectHierarchy, PdfItem startingObject, bool followHierarchy, ref List<PdfItem> results, int Level)
{
    // Arrays, names, numbers and null have no keys to search. The unconditional cast
    // used to throw here as soon as a matched item was not a dictionary.
    PdfDictionary dictionary = startingObject as PdfDictionary;
    if (dictionary == null)
    {
        return;
    }

    foreach (PdfName keyName in dictionary.Elements.KeyNames)
    {
        bool matchFound = false;
        if (followHierarchy)
        {
            // Check the item in the hierarchy at this level for a match
            matchFound = Level < objectHierarchy.Length &&
                         NameMatchesPattern(keyName.Value, objectHierarchy[Level]);
        }
        else
        {
            // We need to check all items for a match, not just the one at this level
            foreach (string pattern in objectHierarchy)
            {
                if (NameMatchesPattern(keyName.Value, pattern))
                {
                    matchFound = true;
                    break;
                }
            }
        }

        if (!matchFound)
        {
            System.Diagnostics.Debug.WriteLine("Level " + Level.ToString() + " - " + keyName.ToString() + " unmatched");
            continue;
        }

        PdfItem item = dictionary.Elements[keyName];
        PdfSharp.Pdf.Advanced.PdfReference reference = item as PdfSharp.Pdf.Advanced.PdfReference;
        if (reference != null)
        {
            // Follow the indirect reference to the object it points at
            item = reference.Value;
        }
        System.Diagnostics.Debug.WriteLine("Level " + Level.ToString() + " - " + keyName.ToString() + " matched");

        // Collect the item when it is the end of the hierarchy, or when every match is wanted
        if (!followHierarchy || Level == objectHierarchy.Length - 1)
        {
            results.Add(item);
        }

        // Call back to this function to search lower levels
        FindObjects(objectHierarchy, item, followHierarchy, ref results, Level + 1);
    }

    System.Diagnostics.Debug.WriteLine("Level " + (Level - 1).ToString());
}

/// <summary>
/// Tests a key name against a pattern: either an exact match, or a pattern with one '*'
/// that stands for any run of characters between the text before and after it.
/// </summary>
private static bool NameMatchesPattern(string name, string pattern)
{
    if (name == null || pattern == null)
    {
        return false;
    }
    if (name == pattern)
    {
        return true;
    }

    int star = pattern.IndexOf('*');
    if (star < 0)
    {
        return false;
    }

    // The prefix is everything before the '*'. The original took one character too few,
    // so "/F*" matched every key, and a leading '*' threw.
    string prefix = pattern.Substring(0, star);
    string suffix = pattern.Substring(star + 1);
    return name.Length >= prefix.Length + suffix.Length &&
           name.StartsWith(prefix, StringComparison.Ordinal) &&
           name.EndsWith(suffix, StringComparison.Ordinal);
}
/// <summary>
/// Uses the ToUnicode CMaps of the given fonts to translate CID encoded text (hex strings
/// shown with Tj) in a content stream into readable text.
/// </summary>
/// <remarks>
/// Fonts are keyed by their position in <paramref name="PDFFonts"/> as "/F0", "/F1", ...
/// and that key is compared with the font name on each Tf line.
/// NOTE(review): this only lines up when the page's font resource names follow the same
/// numbering as the list; confirm against the files being processed.
/// Codes with no mapping are left in the output as "&lt;XXXX&gt;".
/// </remarks>
/// <param name="unreadableText">The content stream that needs to be decoded.</param>
/// <param name="PDFFonts">The /Font objects; each should contain a /ToUnicode stream holding a CMap.</param>
/// <returns>The decoded text.</returns>
/// <exception cref="ArgumentNullException">An argument is null.</exception>
public static string FromUnicode(PdfDictionary.PdfStream unreadableText, List<PdfItem> PDFFonts)
{
    if (unreadableText == null)
    {
        throw new ArgumentNullException("unreadableText");
    }
    if (PDFFonts == null)
    {
        throw new ArgumentNullException("PDFFonts");
    }

    // Get the CMap from each font in the passed list and store it by font name
    Dictionary<string, Dictionary<string, string>> fonts = new Dictionary<string, Dictionary<string, string>>();
    for (int font = 0; font < PDFFonts.Count; font++)
    {
        Dictionary<string, string> map = ReadToUnicodeMap(PDFFonts[font]);
        if (map != null)
        {
            fonts.Add("/F" + font.ToString(), map);
        }
    }

    // The default font is "/F0". If it has no usable CMap, fall back to the first font
    // that has one, or to an empty map, rather than throwing KeyNotFoundException.
    Dictionary<string, string> defaultFontRef;
    if (!fonts.TryGetValue("/F0", out defaultFontRef))
    {
        defaultFontRef = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);
        foreach (Dictionary<string, string> candidate in fonts.Values)
        {
            defaultFontRef = candidate;
            break;
        }
    }

    // Holds the final result to be returned
    System.Text.StringBuilder resultString = new System.Text.StringBuilder();
    // Break the input text into lines
    string[] lines = unreadableText.ToString().Split(new string[] { "\n" }, StringSplitOptions.RemoveEmptyEntries);
    // Holds the CMap of the last font reference, used for any text found after it
    Dictionary<string, string> currentFontRef = defaultFontRef;
    // Are we in a block of text or not? They can break across lines so we need an identifier
    bool blnInText = false;

    foreach (string rawLine in lines)
    {
        string trimmedLine = rawLine.Trim();
        string thisLine = trimmedLine;

        if (thisLine == "q")
        {
            // I think this denotes the start of a text block, and where we need to reset to the default font
            currentFontRef = defaultFontRef;
        }
        else
        {
            int textStart = thisLine.IndexOf(" Td <", StringComparison.Ordinal);
            if (textStart != -1)
            {
                thisLine = thisLine.Substring(textStart + 5);
                blnInText = true;
            }
        }

        if (thisLine.EndsWith("Tf", StringComparison.Ordinal))
        {
            // This is a font assignment. Take note of it and use this font's ToUnicode map
            // when we find text. A Tf line without a space has no font name to read.
            int nameEnd = thisLine.IndexOf(' ');
            Dictionary<string, string> selected;
            if (nameEnd > 0 && fonts.TryGetValue(thisLine.Substring(0, nameEnd), out selected))
            {
                currentFontRef = selected;
            }
        }
        else if (thisLine.EndsWith("> Tj", StringComparison.Ordinal))
        {
            thisLine = thisLine.Substring(0, thisLine.IndexOf("> Tj", StringComparison.Ordinal));
        }

        if (blnInText)
        {
            // Every group of 4 hex characters is one glyph code; look each one up directly
            foreach (string code in thisLine.SplitInParts(4))
            {
                string decoded;
                if (currentFontRef.TryGetValue(code, out decoded))
                {
                    resultString.Append(decoded);
                }
                else
                {
                    resultString.Append('<').Append(code).Append('>');
                }
            }
        }

        if (trimmedLine.EndsWith("> Tj", StringComparison.Ordinal))
        {
            blnInText = false;
            if (trimmedLine.IndexOf(" 0 Td <", StringComparison.Ordinal) == -1)
            {
                // The vertical coords have changed, so add a new line
                resultString.Append(Environment.NewLine);
            }
            else
            {
                resultString.Append(" ");
            }
        }
    }

    return resultString.ToString();
}

/// <summary>
/// Builds the glyph-code to text table from a font's /ToUnicode CMap. Returns null when the
/// item is not a font dictionary, has no /ToUnicode stream, or the CMap has no bfchar or
/// bfrange section.
/// </summary>
private static Dictionary<string, string> ReadToUnicodeMap(PdfItem fontItem)
{
    PdfDictionary fontDictionary = fontItem as PdfDictionary;
    if (fontDictionary == null)
    {
        return null;
    }

    PdfItem item = fontDictionary.Elements["/ToUnicode"];
    PdfSharp.Pdf.Advanced.PdfReference reference = item as PdfSharp.Pdf.Advanced.PdfReference;
    if (reference != null)
    {
        item = reference.Value;
    }

    PdfDictionary cmapDictionary = item as PdfDictionary;
    if (cmapDictionary == null || cmapDictionary.Stream == null)
    {
        return null;
    }

    string cmap = cmapDictionary.Stream.ToString();
    List<string> charSections = ExtractCMapSections(cmap, "beginbfchar", "endbfchar");
    List<string> rangeSections = ExtractCMapSections(cmap, "beginbfrange", "endbfrange");
    if (charSections.Count == 0 && rangeSections.Count == 0)
    {
        return null;
    }

    // Hex digits may be written in either case, so the lookup ignores case
    Dictionary<string, string> map = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);
    foreach (string section in charSections)
    {
        AddBfCharMappings(TokenizeCMapSection(section), map);
    }
    foreach (string section in rangeSections)
    {
        AddBfRangeMappings(TokenizeCMapSection(section), map);
    }
    return map;
}

/// <summary>Returns the text between every begin/end keyword pair found in the CMap.</summary>
private static List<string> ExtractCMapSections(string cmap, string beginKeyword, string endKeyword)
{
    List<string> sections = new List<string>();
    int searchFrom = 0;
    while (true)
    {
        int begin = cmap.IndexOf(beginKeyword, searchFrom, StringComparison.Ordinal);
        if (begin < 0)
        {
            break;
        }
        begin += beginKeyword.Length;

        int end = cmap.IndexOf(endKeyword, begin, StringComparison.Ordinal);
        if (end < 0)
        {
            break;
        }

        sections.Add(cmap.Substring(begin, end - begin));
        searchFrom = end + endKeyword.Length;
    }
    return sections;
}

/// <summary>
/// Splits a CMap section into tokens: the hex digits of each &lt;...&gt; string (white space
/// removed) and the literal "[" and "]" array brackets. Line breaks play no part.
/// </summary>
private static List<string> TokenizeCMapSection(string section)
{
    List<string> tokens = new List<string>();
    foreach (System.Text.RegularExpressions.Match match in
             System.Text.RegularExpressions.Regex.Matches(section, @"<([0-9A-Fa-f\s]*)>|\[|\]"))
    {
        if (match.Groups[1].Success)
        {
            tokens.Add(System.Text.RegularExpressions.Regex.Replace(match.Groups[1].Value, @"\s+", ""));
        }
        else
        {
            tokens.Add(match.Value);
        }
    }
    return tokens;
}

/// <summary>Adds the "&lt;code&gt; &lt;text&gt;" pairs of a bfchar section to the map.</summary>
private static void AddBfCharMappings(List<string> tokens, Dictionary<string, string> map)
{
    for (int i = 0; i + 1 < tokens.Count; i += 2)
    {
        string code = tokens[i];
        if (code.Length == 0 || code == "[" || code == "]")
        {
            continue;
        }
        map[code] = DecodeUtf16Hex(tokens[i + 1]);
    }
}

/// <summary>
/// Adds the entries of a bfrange section to the map. Each entry is either
/// "&lt;low&gt; &lt;high&gt; &lt;text&gt;" (the last character of text is incremented once per
/// code) or "&lt;low&gt; &lt;high&gt; [&lt;text&gt; ...]" (one text per code).
/// </summary>
private static void AddBfRangeMappings(List<string> tokens, Dictionary<string, string> map)
{
    int i = 0;
    while (i + 2 < tokens.Count)
    {
        string lowHex = tokens[i];
        int low = 0;
        int high = 0;
        bool valid = TryParseCode(lowHex, out low) && TryParseCode(tokens[i + 1], out high) && high >= low;

        if (tokens[i + 2] == "[")
        {
            int j = i + 3;
            int offset = 0;
            while (j < tokens.Count && tokens[j] != "]")
            {
                if (valid && low + offset <= high)
                {
                    map[FormatCode(low + offset, lowHex.Length)] = DecodeUtf16Hex(tokens[j]);
                }
                offset++;
                j++;
            }
            i = j + 1;
        }
        else
        {
            if (valid)
            {
                string first = DecodeUtf16Hex(tokens[i + 2]);
                if (first.Length > 0)
                {
                    string head = first.Substring(0, first.Length - 1);
                    char last = first[first.Length - 1];
                    for (int code = low; code <= high; code++)
                    {
                        map[FormatCode(code, lowHex.Length)] = head + unchecked((char)(last + (code - low)));
                    }
                }
            }
            i += 3;
        }
    }
}

/// <summary>Parses a glyph code of 1 to 4 hex digits; anything else is rejected.</summary>
private static bool TryParseCode(string hex, out int code)
{
    code = 0;
    return hex.Length >= 1 && hex.Length <= 4 &&
           int.TryParse(hex, System.Globalization.NumberStyles.HexNumber,
                        System.Globalization.CultureInfo.InvariantCulture, out code);
}

/// <summary>Formats a glyph code as upper-case hex, zero padded to the given width.</summary>
private static string FormatCode(int code, int width)
{
    return code.ToString("X", System.Globalization.CultureInfo.InvariantCulture).PadLeft(width, '0');
}

/// <summary>
/// Decodes a hex string as big-endian UTF-16 code units, 4 hex digits per unit. Shorter
/// input is zero padded on the left; a unit that is not valid hex is skipped.
/// </summary>
private static string DecodeUtf16Hex(string hex)
{
    if (string.IsNullOrEmpty(hex))
    {
        return "";
    }

    int paddedLength = (hex.Length + 3) / 4 * 4;
    hex = hex.PadLeft(paddedLength, '0');

    System.Text.StringBuilder text = new System.Text.StringBuilder(paddedLength / 4);
    for (int i = 0; i < hex.Length; i += 4)
    {
        int unit;
        if (int.TryParse(hex.Substring(i, 4), System.Globalization.NumberStyles.HexNumber,
                         System.Globalization.CultureInfo.InvariantCulture, out unit))
        {
            text.Append((char)unit);
        }
    }
    return text.ToString();
}
// Credit to http://stackoverflow.com/questions/4133377/
/// <summary>
/// Lazily cuts a string into consecutive pieces of <paramref name="partLength"/> characters;
/// the final piece holds whatever is left and may be shorter. Because this is an iterator,
/// the argument checks run when enumeration starts, not when the method is called.
/// </summary>
/// <param name="s">The string to cut up.</param>
/// <param name="partLength">Length of each piece; must be positive.</param>
/// <returns>The pieces, in order.</returns>
private static IEnumerable<String> SplitInParts(this String s, Int32 partLength)
{
    if (s == null)
    {
        throw new ArgumentNullException("s");
    }
    if (partLength <= 0)
    {
        throw new ArgumentException("Part length has to be positive.", "partLength");
    }

    int offset = 0;
    while (offset < s.Length)
    {
        int remaining = s.Length - offset;
        int take = remaining < partLength ? remaining : partLength;
        yield return s.Substring(offset, take);
        offset += partLength;
    }
}
}
}
/// <summary>
/// Extracts literal "(text)" strings from an uncompressed PDF content stream by scanning
/// it byte by byte and watching for a handful of text operators.
/// </summary>
public class PDFParser
{
    /// BT = Beginning of a text object operator
    /// ET = End of a text object operator
    /// Td move to the start of next line
    /// 5 Ts = superscript
    /// -5 Ts = subscript

    #region Fields

    /// <summary>
    /// The number of characters to keep, when extracting text.
    /// </summary>
    private const int NumberOfCharsToKeep = 15;

    // Operator groups, allocated once instead of once per input byte.
    private static readonly string[] PositionTokens = { "TD", "Td" };
    private static readonly string[] NewLineTokens = { "'", "T*", "\"" };
    private static readonly string[] ShowTextTokens = { "Tj" };
    private static readonly string[] EndTextTokens = { "ET" };
    private static readonly string[] BeginTextTokens = { "BT" };

    #endregion

    #region ExtractTextFromPDFBytes

    /// <summary>
    /// This method processes an uncompressed Adobe (text) object
    /// and extracts text.
    /// </summary>
    /// <remarks>
    /// Output conventions: "\n\r" after TD/Td, "\n" after ', T* or ", and a single space
    /// after Tj and after ET. Only characters in the ranges ' '..'~' and 128..254 are
    /// copied; a backslash makes the next character literal and escape sequences are not
    /// interpreted.
    /// </remarks>
    /// <param name="input">uncompressed</param>
    /// <returns>The extracted text; "" for null or empty input.</returns>
    public string ExtractTextFromPDFBytes(byte[] input)
    {
        if (input == null || input.Length == 0) return "";

        System.Text.StringBuilder resultString = new System.Text.StringBuilder();

        // Flag showing if we are currently inside a text object
        bool inTextObject = false;

        // Flag showing if the next character is literal
        // e.g. '\\' to get a '\' character or '\(' to get '('
        bool nextLiteral = false;

        // () Bracket nesting level. Text appears inside ()
        int bracketDepth = 0;

        // Keep previous chars so operators can be recognised after the fact
        char[] previousCharacters = new char[NumberOfCharsToKeep];
        for (int j = 0; j < previousCharacters.Length; j++) previousCharacters[j] = ' ';

        for (int i = 0; i < input.Length; i++)
        {
            char c = (char)input[i];

            if (inTextObject)
            {
                // Position the text
                if (bracketDepth == 0)
                {
                    if (CheckToken(PositionTokens, previousCharacters))
                    {
                        resultString.Append("\n\r");
                    }
                    else if (CheckToken(NewLineTokens, previousCharacters))
                    {
                        resultString.Append("\n");
                    }
                    else if (CheckToken(ShowTextTokens, previousCharacters))
                    {
                        resultString.Append(" ");
                    }
                }

                if (bracketDepth == 0 && CheckToken(EndTextTokens, previousCharacters))
                {
                    // End of a text object; the current character is not examined further
                    inTextObject = false;
                    resultString.Append(" ");
                }
                else if (c == '(' && bracketDepth == 0 && !nextLiteral)
                {
                    // Start outputting text
                    bracketDepth = 1;
                }
                else if (c == ')' && bracketDepth == 1 && !nextLiteral)
                {
                    // Stop outputting text
                    bracketDepth = 0;
                }
                else if (bracketDepth == 1)
                {
                    // Just a normal text character
                    if (c == '\\' && !nextLiteral)
                    {
                        // Only print out the next character no matter what. Do not interpret.
                        nextLiteral = true;
                    }
                    else
                    {
                        if ((c >= ' ' && c <= '~') || (c >= 128 && c < 255))
                        {
                            resultString.Append(c);
                        }
                        nextLiteral = false;
                    }
                }
            }

            // Store the recent characters for when we have to go back for a check
            System.Array.Copy(previousCharacters, 1, previousCharacters, 0, previousCharacters.Length - 1);
            previousCharacters[previousCharacters.Length - 1] = c;

            // Start of a text object
            if (!inTextObject && CheckToken(BeginTextTokens, previousCharacters))
            {
                inTextObject = true;
            }
        }

        return resultString.ToString();
    }

    #endregion

    #region CheckToken

    /// <summary>
    /// Check if one of the given operator tokens just came along (e.g. BT), i.e. the most
    /// recent character is a delimiter, the token sits directly before it, and the token
    /// is itself preceded by a delimiter.
    /// </summary>
    /// <remarks>
    /// Tokens of any length are supported. The original returned false as soon as it met a
    /// one-character token, which hid every token listed after it (T* was never found).
    /// </remarks>
    /// <param name="tokens">the searched tokens</param>
    /// <param name="recent">the recent character array</param>
    /// <returns>True when any token matches.</returns>
    private bool CheckToken(string[] tokens, char[] recent)
    {
        int last = recent.Length - 1;
        if (last < 0 || !IsDelimiter(recent[last]))
        {
            return false;
        }

        foreach (string token in tokens)
        {
            // Index of the token's first character. There must be room for one more
            // character in front of it.
            int start = last - token.Length;
            if (token.Length == 0 || start < 1 || !IsDelimiter(recent[start - 1]))
            {
                continue;
            }

            bool matches = true;
            for (int k = 0; k < token.Length; k++)
            {
                if (recent[start + k] != token[k])
                {
                    matches = false;
                    break;
                }
            }
            if (matches)
            {
                return true;
            }
        }
        return false;
    }

    /// <summary>Space, carriage return and line feed separate operators.</summary>
    private static bool IsDelimiter(char c)
    {
        return c == ' ' || c == (char)0x0d || c == (char)0x0a;
    }

    #endregion
}
Thank you to all those who provided help and snippets that allowed me to finally pull a working solution together