There are some tools which allow to extract the whole text portion of a PDF file in order to full text index the PDF.
What I need is a way to search for certain strings and, if thery were found in the PDF file, return the page number?
This example uses the library included with Adobe Reader, and comes from http://www.dotnetspider.com/resources/5040-Get-PDF-Page-Number.aspx:
using Acrobat;
using AFORMAUTLib;
private void pdfRandD(string fPath)
{
AcroPDDocClass objPages = new AcroPDDocClass();
objPages.Open(fPath);
long TotalPDFPages = objPages.GetNumPages();
objPages.Close();
AcroAVDocClass avDoc = new AcroAVDocClass();
avDoc.Open(fPath, "Title");
IAFormApp formApp = new AFormAppClass();
IFields myFields = (IFields)formApp.Fields;
string searchWord = "Search String";
string k = "";
StreamWriter sw = new
StreamWriter(@"D:\KCG_FileChecker_Inputs\MAC\pdf\0230_525490_23_cha17.txt", false);
for (int p = 0; p < TotalPDFPages; p++)
{
int numWords = int.Parse(myFields.ExecuteThisJavascript("event.value=this.getPageNumWords(" + p + ");"));
k = "";
for (int i = 0; i < numWords; i++)
{
string chkWord = myFields.ExecuteThisJavascript("event.value=this.getPageNthWord(" + p + "," + i + ", true);");
k = k + " " + chkWord;
}
if(k.Trim().Contains(searchWord))
{
int pNum = int.Parse(myFields.ExecuteThisJavascript("event.value=this.getPageLabel(" + p + ",true);"));
sw.WriteLine("The Word " + searchWord + " is exists in " + pNum);
}
}
sw.Close();
MessageBox.Show("Process completed");
}