I need to run some analysis my extracting data from a PDF document.
Using iTextSharp
, I used the PdfTextExtractor.GetTextFromPage
method to extract contents from a PDF document and it returned me in a single long line.
Is there a way to get the text by line so that i can store them in an array? So that i can analyze the data by line which will be more flexible.
Below is the code I used:
string urlFileName1 = "pdf_link";
PdfReader reader = new PdfReader(urlFileName1);
string text = string.Empty;
for (int page = 1; page <= reader.NumberOfPages; page++)
{
text += PdfTextExtractor.GetTextFromPage(reader, page);
}
reader.Close();
candidate3.Text = text.ToString();
public void ExtractTextFromPdf(string path)
{
using (PdfReader reader = new PdfReader(path))
{
StringBuilder text = new StringBuilder();
ITextExtractionStrategy Strategy = new iTextSharp.text.pdf.parser.LocationTextExtractionStrategy();
for (int i = 1; i <= reader.NumberOfPages; i++)
{
string page = "";
page = PdfTextExtractor.GetTextFromPage(reader, i,Strategy);
string[] lines = page.Split('\n');
foreach (string line in lines)
{
MessageBox.Show(line);
}
}
}
}