Reading a PDF document, extracting its hyperlinks, and saving them as a .csv file
// Lets the user pick a PDF, collects every URI link annotation in it, and
// emits one CSV row per link: file name, file path, file type, link text,
// link URL, page number. The rows are appended to richTextBox1 and the text
// box contents are then written to a .csv file on drive E:.
OpenFileDialog dlg = new OpenFileDialog();
dlg.Filter = "PDF Files(*.PDF)|*.PDF|All Files (*.*)|*.*";
if (dlg.ShowDialog() == DialogResult.OK)
{
    // These values are the same for every row, so compute them once.
    string filePath = dlg.FileName;
    string fileName = dlg.SafeFileName;
    // System.IO.Path handles names with several dots ("a.b.pdf") and names
    // with no extension; Split('.')[1] returned "b" for the former and threw
    // for the latter. Fully qualified because iTextSharp's parser namespace
    // may also define a type called Path.
    string fileType = System.IO.Path.GetExtension(fileName).TrimStart('.');
    string baseName = System.IO.Path.GetFileNameWithoutExtension(fileName);

    // Quotes a CSV field when it contains a separator, a quote or a line
    // break, so link text or URLs with commas no longer shift the columns.
    Func<string, string> csvField = value =>
    {
        if (string.IsNullOrEmpty(value))
        {
            return string.Empty;
        }
        if (value.IndexOfAny(new char[] { ',', '"', '\r', '\n' }) < 0)
        {
            return value;
        }
        return "\"" + value.Replace("\"", "\"\"") + "\"";
    };

    // Rows are buffered and appended to the text box once, instead of
    // rebuilding richTextBox1.Text for every link.
    System.Text.StringBuilder rows = new System.Text.StringBuilder();

    PdfReader reader = new PdfReader(filePath);
    try
    {
        int pageCount = reader.NumberOfPages;
        // iText page numbers are 1-based.
        for (int page = 1; page <= pageCount; page++)
        {
            PdfDictionary pageDictionary = reader.GetPageN(page);
            PdfArray annots = pageDictionary.GetAsArray(PdfName.ANNOTS);
            // A page without annotations has nothing to report; an empty
            // array simply yields no iterations below.
            if (annots == null)
            {
                continue;
            }

            foreach (PdfObject item in annots.ArrayList)
            {
                // Array entries may be indirect references; resolve them and
                // skip anything that is not a dictionary.
                PdfDictionary annotation = PdfReader.GetPdfObject(item) as PdfDictionary;
                if (annotation == null)
                {
                    continue;
                }
                // Comparing from the constant side tolerates a missing /Subtype.
                if (!PdfName.LINK.Equals(annotation.GetAsName(PdfName.SUBTYPE)))
                {
                    continue;
                }
                // GetAsDict resolves an indirectly referenced action and
                // returns null when there is none (a direct cast would throw).
                PdfDictionary action = annotation.GetAsDict(PdfName.A);
                if (action == null)
                {
                    continue;
                }
                if (!PdfName.URI.Equals(action.GetAsName(PdfName.S)))
                {
                    continue;
                }
                PdfString uri = action.GetAsString(PdfName.URI);
                if (uri == null)
                {
                    continue;
                }
                string linkUrl = uri.ToString();

                // The link text is whatever page text lies inside the
                // annotation rectangle; without a usable /Rect it stays empty.
                string linkLabel = string.Empty;
                PdfArray rectArray = annotation.GetAsArray(PdfName.RECT);
                if (rectArray != null && rectArray.ArrayList.Count >= 4)
                {
                    iTextSharp.text.Rectangle rect = new iTextSharp.text.Rectangle(
                        ((PdfNumber)rectArray[0]).FloatValue,
                        ((PdfNumber)rectArray[1]).FloatValue,
                        ((PdfNumber)rectArray[2]).FloatValue,
                        ((PdfNumber)rectArray[3]).FloatValue);
                    RenderFilter[] renderFilter = new RenderFilter[] { new RegionTextRenderFilter(rect) };
                    ITextExtractionStrategy strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);
                    linkLabel = PdfTextExtractor.GetTextFromPage(reader, page, strategy).Trim();
                }

                // The trailing comma is kept so the column layout matches
                // earlier output; the stray space after the line break (which
                // prefixed every following row) is dropped.
                rows.Append(csvField(fileName)).Append(',')
                    .Append(csvField(filePath)).Append(',')
                    .Append(csvField(fileType)).Append(',')
                    .Append(csvField(linkLabel)).Append(',')
                    .Append(csvField(linkUrl)).Append(',')
                    .Append(page).Append(',')
                    .Append('\n');
            }
        }
    }
    finally
    {
        // Release the PDF file handle even when parsing fails part-way.
        reader.Close();
    }

    richTextBox1.Text += rows.ToString();

    // As before, the whole text box content is exported, including anything
    // it already held before this run.
    using (StreamWriter writer = new StreamWriter("E:\\" + baseName + DateTime.Now.Millisecond + ".csv"))
    {
        writer.Write(richTextBox1.Text);
    }
}
}
// Lets the user pick a PDF, collects every URI link annotation in it, and
// emits one CSV row per link: file name, file path, file type, link text,
// link URL, page number. The rows are appended to richTextBox1 and the text
// box contents are then written to a .csv file on drive E:.
OpenFileDialog dlg = new OpenFileDialog();
dlg.Filter = "PDF Files(*.PDF)|*.PDF|All Files (*.*)|*.*";
if (dlg.ShowDialog() == DialogResult.OK)
{
    // These values are the same for every row, so compute them once.
    string filePath = dlg.FileName;
    string fileName = dlg.SafeFileName;
    // System.IO.Path handles names with several dots ("a.b.pdf") and names
    // with no extension; Split('.')[1] returned "b" for the former and threw
    // for the latter. Fully qualified because iTextSharp's parser namespace
    // may also define a type called Path.
    string fileType = System.IO.Path.GetExtension(fileName).TrimStart('.');
    string baseName = System.IO.Path.GetFileNameWithoutExtension(fileName);

    // Quotes a CSV field when it contains a separator, a quote or a line
    // break, so link text or URLs with commas no longer shift the columns.
    Func<string, string> csvField = value =>
    {
        if (string.IsNullOrEmpty(value))
        {
            return string.Empty;
        }
        if (value.IndexOfAny(new char[] { ',', '"', '\r', '\n' }) < 0)
        {
            return value;
        }
        return "\"" + value.Replace("\"", "\"\"") + "\"";
    };

    // Rows are buffered and appended to the text box once, instead of
    // rebuilding richTextBox1.Text for every link.
    System.Text.StringBuilder rows = new System.Text.StringBuilder();

    PdfReader reader = new PdfReader(filePath);
    try
    {
        int pageCount = reader.NumberOfPages;
        // iText page numbers are 1-based.
        for (int page = 1; page <= pageCount; page++)
        {
            PdfDictionary pageDictionary = reader.GetPageN(page);
            PdfArray annots = pageDictionary.GetAsArray(PdfName.ANNOTS);
            // A page without annotations has nothing to report; an empty
            // array simply yields no iterations below.
            if (annots == null)
            {
                continue;
            }

            foreach (PdfObject item in annots.ArrayList)
            {
                // Array entries may be indirect references; resolve them and
                // skip anything that is not a dictionary.
                PdfDictionary annotation = PdfReader.GetPdfObject(item) as PdfDictionary;
                if (annotation == null)
                {
                    continue;
                }
                // Comparing from the constant side tolerates a missing /Subtype.
                if (!PdfName.LINK.Equals(annotation.GetAsName(PdfName.SUBTYPE)))
                {
                    continue;
                }
                // GetAsDict resolves an indirectly referenced action and
                // returns null when there is none (a direct cast would throw).
                PdfDictionary action = annotation.GetAsDict(PdfName.A);
                if (action == null)
                {
                    continue;
                }
                if (!PdfName.URI.Equals(action.GetAsName(PdfName.S)))
                {
                    continue;
                }
                PdfString uri = action.GetAsString(PdfName.URI);
                if (uri == null)
                {
                    continue;
                }
                string linkUrl = uri.ToString();

                // The link text is whatever page text lies inside the
                // annotation rectangle; without a usable /Rect it stays empty.
                string linkLabel = string.Empty;
                PdfArray rectArray = annotation.GetAsArray(PdfName.RECT);
                if (rectArray != null && rectArray.ArrayList.Count >= 4)
                {
                    iTextSharp.text.Rectangle rect = new iTextSharp.text.Rectangle(
                        ((PdfNumber)rectArray[0]).FloatValue,
                        ((PdfNumber)rectArray[1]).FloatValue,
                        ((PdfNumber)rectArray[2]).FloatValue,
                        ((PdfNumber)rectArray[3]).FloatValue);
                    RenderFilter[] renderFilter = new RenderFilter[] { new RegionTextRenderFilter(rect) };
                    ITextExtractionStrategy strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);
                    linkLabel = PdfTextExtractor.GetTextFromPage(reader, page, strategy).Trim();
                }

                // The trailing comma is kept so the column layout matches
                // earlier output; the stray space after the line break (which
                // prefixed every following row) is dropped.
                rows.Append(csvField(fileName)).Append(',')
                    .Append(csvField(filePath)).Append(',')
                    .Append(csvField(fileType)).Append(',')
                    .Append(csvField(linkLabel)).Append(',')
                    .Append(csvField(linkUrl)).Append(',')
                    .Append(page).Append(',')
                    .Append('\n');
            }
        }
    }
    finally
    {
        // Release the PDF file handle even when parsing fails part-way.
        reader.Close();
    }

    richTextBox1.Text += rows.ToString();

    // As before, the whole text box content is exported, including anything
    // it already held before this run.
    using (StreamWriter writer = new StreamWriter("E:\\" + baseName + DateTime.Now.Millisecond + ".csv"))
    {
        writer.Write(richTextBox1.Text);
    }
}
}
Comments