I need to extract data from .PDF files and load it in to SQL 2008. Can any one tell me how to proceed??
Here is an example of how to use iTextSharp to extract text data from a PDF. You'll have to fiddle with it some to make it do exactly what you want, I think it's a good outline. You can see how the StringBuilder is being used to store the text, but you could easily change that to use SQL.
static void Main(string[] args)
{
PdfReader reader = new PdfReader(@"c:\test.pdf");
StringBuilder builder = new StringBuilder();
for (int x = 1; x <= reader.NumberOfPages; x++)
{
PdfDictionary page = reader.GetPageN(x);
IRenderListener listener = new SBTextRenderer(builder);
PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener);
PdfDictionary pageDic = reader.GetPageN(x);
PdfDictionary resourcesDic = pageDic.GetAsDict(PdfName.RESOURCES);
processor.ProcessContent(ContentByteUtils.GetContentBytesForPage(reader, x), resourcesDic);
}
}
public class SBTextRenderer : IRenderListener
{
private StringBuilder _builder;
public SBTextRenderer(StringBuilder builder)
{
_builder = builder;
}
#region IRenderListener Members
public void BeginTextBlock()
{
}
public void EndTextBlock()
{
}
public void RenderImage(ImageRenderInfo renderInfo)
{
}
public void RenderText(TextRenderInfo renderInfo)
{
_builder.Append(renderInfo.GetText());
}
#endregion
}