Click here to Skip to main content
15,885,244 members
Articles / Web Development / ASP.NET

From PDF files to plain text in a WebMatrix site

Rate me:
Please Sign up or sign in to vote.
4.10/5 (3 votes)
14 Mar 2013 | CPOL | 2 min read | 22.4K   270   8
How to use the PDFBox Java library in an ASP.NET Web Pages project
using System;
using System.Collections.Generic;
using System.Web;
using java.util;
using org.apache.pdfbox.pdmodel;
using org.apache.pdfbox.util;

/// <summary>
/// Snapshot of a PDF file's document-information metadata and its extracted
/// plain text, read through the PDFBox library.
/// </summary>
/// <remarks>
/// All values are read once in the constructor; the underlying PDF document is
/// closed before the constructor returns.
/// </remarks>
public class PdfFile
{
    /// <summary>Author metadata entry, or an empty string when absent.</summary>
    public string Author { get; set; }

    /// <summary>Text extracted from the document by <c>PDFTextStripper</c>.</summary>
    public string Content { get; set; }

    /// <summary>Creation date, or <see cref="DateTime.MinValue"/> when absent.</summary>
    public DateTime Created { get; set; }

    /// <summary>Creator metadata entry, or an empty string when absent.</summary>
    public string Creator { get; set; }

    /// <summary>Keywords metadata entry, or an empty string when absent.</summary>
    public string Keywords { get; set; }

    /// <summary>Modification date, or <see cref="DateTime.MinValue"/> when absent.</summary>
    public DateTime Modified { get; set; }

    /// <summary>Producer metadata entry, or an empty string when absent.</summary>
    public string Producer { get; set; }

    /// <summary>Subject metadata entry, or an empty string when absent.</summary>
    public string Subject { get; set; }

    /// <summary>Title metadata entry, or an empty string when absent.</summary>
    public string Title { get; set; }

    /// <summary>Trapped metadata entry, or an empty string when absent.</summary>
    public string Trapped { get; set; }

    /// <summary>
    /// Converts a Java <c>Calendar</c> into a <see cref="DateTime"/> by copying
    /// its year, month, day, hour, minute, second and millisecond fields.
    /// </summary>
    /// <param name="calendar">Calendar to convert; may be <c>null</c>.</param>
    /// <returns>
    /// A <see cref="DateTime"/> built from the calendar's fields, or
    /// <see cref="DateTime.MinValue"/> when <paramref name="calendar"/> is <c>null</c>.
    /// </returns>
    /// <remarks>
    /// The fields are copied as-is: no time-zone conversion is applied and the
    /// result is created without an explicit <see cref="DateTimeKind"/>.
    /// </remarks>
    public static DateTime CalendarToDateTime(Calendar calendar)
    {
        if (calendar == null)
        {
            return DateTime.MinValue;
        }

        return new DateTime(
            calendar.get(Calendar.YEAR),
            calendar.get(Calendar.MONTH) + 1, // Calendar.MONTH is zero-based; DateTime months start at 1.
            calendar.get(Calendar.DAY_OF_MONTH),
            calendar.get(Calendar.HOUR_OF_DAY),
            calendar.get(Calendar.MINUTE),
            calendar.get(Calendar.SECOND),
            calendar.get(Calendar.MILLISECOND));
    }

    /// <summary>
    /// Loads the PDF at <paramref name="FilePath"/>, copies its document
    /// information into the properties and extracts its text into
    /// <see cref="Content"/>.
    /// </summary>
    /// <param name="FilePath">Path of the PDF file passed to <c>PDDocument.load</c>.</param>
    public PdfFile(string FilePath)
    {
        PDDocument document = PDDocument.load(FilePath);

        try
        {
            PDDocumentInformation info = document.getDocumentInformation();

            // Missing metadata entries come back as null; normalise them to "".
            Title = info.getTitle() ?? "";
            Subject = info.getSubject() ?? "";
            Author = info.getAuthor() ?? "";
            Creator = info.getCreator() ?? "";
            Producer = info.getProducer() ?? "";
            Keywords = info.getKeywords() ?? "";
            Trapped = info.getTrapped() ?? "";
            Created = CalendarToDateTime(info.getCreationDate());
            Modified = CalendarToDateTime(info.getModificationDate());

            PDFTextStripper stripper = new PDFTextStripper();
            Content = stripper.getText(document);
        }
        finally
        {
            // Close the document even when reading or text extraction throws,
            // so its resources are not leaked.
            document.close();
        }
    }
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


Written By
Chief Technology Officer Federfarma Pavia
Italy
This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.

Comments and Discussions