Reading a PDF document, extracting its hyperlinks, and saving them as a .csv file.

- January 16, 2014

// Lets the user pick a PDF, collects every URI link annotation in it together
// with the text found inside the link's rectangle, appends one CSV row per
// link to richTextBox1, and finally writes the text box content to a .csv file.
// Row layout: FileName,FilePath,FileType,LinkLabel,LinkURL,PageNumber,
string FileName = string.Empty;
string FilePath = string.Empty;
string FileType = string.Empty;
string LinkLable = string.Empty;
string LinkURL = string.Empty;
int pagenumber;

// Quotes a value for CSV output when it contains a separator, a quote or a
// line break, so that link text such as "Terms, Conditions" stays in one column.
Func<string, string> csvField = value =>
{
    if (string.IsNullOrEmpty(value))
    {
        return string.Empty;
    }

    if (value.IndexOfAny(new[] { ',', '"', '\r', '\n' }) < 0)
    {
        return value;
    }

    return "\"" + value.Replace("\"", "\"\"") + "\"";
};

using (OpenFileDialog dlg = new OpenFileDialog())
{
    dlg.Filter = "PDF Files(*.PDF)|*.PDF|All Files (*.*)|*.*";
    if (dlg.ShowDialog() == DialogResult.OK)
    {
        string filepath = dlg.FileName;

        // These values are the same for every row, so compute them once.
        // System.IO.Path is fully qualified because newer iTextSharp versions
        // also declare a type named Path in iTextSharp.text.pdf.parser.
        FileName = dlg.SafeFileName;
        FilePath = filepath;
        // Path.GetExtension copes with names that have no dot or several dots,
        // where Split('.')[1] would throw or return the wrong part.
        FileType = System.IO.Path.GetExtension(FileName).TrimStart('.');

        // Rows are collected here and appended to the text box once, instead
        // of re-allocating the whole text box content for every link.
        System.Text.StringBuilder rows = new System.Text.StringBuilder();

        PdfReader R = new PdfReader(filepath);
        try
        {
            int PageCount = R.NumberOfPages;

            // PDF page numbers are 1-based.
            for (int i = 1; i <= PageCount; i++)
            {
                PdfDictionary PageDictionary = R.GetPageN(i);

                // Pages without annotations have no /Annots entry at all.
                PdfArray Annots = PageDictionary.GetAsArray(PdfName.ANNOTS);
                if ((Annots == null) || (Annots.Length == 0))
                {
                    continue;
                }

                foreach (PdfObject A in Annots.ArrayList)
                {
                    // Resolve a possible indirect reference; skip anything
                    // that does not resolve to a dictionary.
                    PdfDictionary AnnotationDictionary = PdfReader.GetPdfObject(A) as PdfDictionary;
                    if (AnnotationDictionary == null)
                    {
                        continue;
                    }

                    // Only link annotations are of interest. Calling Equals on
                    // the constant keeps this safe when /Subtype is missing.
                    if (!PdfName.LINK.Equals(AnnotationDictionary.Get(PdfName.SUBTYPE)))
                    {
                        continue;
                    }

                    // GetAsDict also resolves an /A entry stored as an indirect
                    // reference, which a direct cast of Get(...) would reject.
                    PdfDictionary AnnotationAction = AnnotationDictionary.GetAsDict(PdfName.A);
                    if (AnnotationAction == null)
                    {
                        continue;
                    }

                    // Only URI actions carry a hyperlink target.
                    if (!PdfName.URI.Equals(AnnotationAction.Get(PdfName.S)))
                    {
                        continue;
                    }

                    PdfString Link = AnnotationAction.GetAsString(PdfName.URI);
                    if (Link == null)
                    {
                        continue;
                    }

                    // The visible link text is whatever page text falls inside
                    // the annotation rectangle; without a usable /Rect the
                    // label is left empty.
                    string linkText = string.Empty;
                    PdfArray LinkLocation = AnnotationDictionary.GetAsArray(PdfName.RECT);
                    if ((LinkLocation != null) && (LinkLocation.Size >= 4))
                    {
                        iTextSharp.text.Rectangle rect = new iTextSharp.text.Rectangle(
                            ((PdfNumber)LinkLocation[0]).FloatValue,
                            ((PdfNumber)LinkLocation[1]).FloatValue,
                            ((PdfNumber)LinkLocation[2]).FloatValue,
                            ((PdfNumber)LinkLocation[3]).FloatValue);
                        RenderFilter[] renderFilter = new RenderFilter[] { new RegionTextRenderFilter(rect) };
                        ITextExtractionStrategy textExtractionStrategy =
                            new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);
                        linkText = PdfTextExtractor.GetTextFromPage(R, i, textExtractionStrategy).Trim();
                    }

                    LinkLable = linkText;
                    LinkURL = Link.ToString();
                    pagenumber = i;

                    // The trailing comma is kept so the column count matches
                    // files written before; the row ends with a bare "\n" so
                    // the next row does not start with a stray space.
                    rows.Append(csvField(FileName)).Append(',')
                        .Append(csvField(FilePath)).Append(',')
                        .Append(csvField(FileType)).Append(',')
                        .Append(csvField(LinkLable)).Append(',')
                        .Append(csvField(LinkURL)).Append(',')
                        .Append(pagenumber).Append(',')
                        .Append("\n");
                }
            }
        }
        finally
        {
            // Release the handle on the PDF even if extraction throws.
            R.Close();
        }

        richTextBox1.Text += rows.ToString();

        // NOTE(review): the output folder is hard-coded to E:\ and the
        // millisecond suffix is only a weak guard against name collisions.
        string outputPath = "E:\\" + System.IO.Path.GetFileNameWithoutExtension(FileName) + DateTime.Now.Millisecond + ".csv";
        using (StreamWriter writer = new StreamWriter(outputPath))
        {
            // As before, the file receives the complete text box content.
            writer.Write(richTextBox1.Text);
        }
    }
}

}

Search This Blog

SharePoint Shine

Reading a PDF document, extracting its hyperlinks, and saving them as a .csv file.

Comments

Popular posts from this blog

My Interview Experience

Create multiple web parts in one SPFx Solution

SPFx Documentation Sharpoint Online