Click here to Skip to main content
15,891,708 members
Please Sign up or sign in to vote.
1.00/5 (1 vote)
See more:
C#
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using iTextSharp.text;
using iTextSharp.text.pdf;
using System.IO;
using PDFjet.NET;

namespace PDFtoTextConversion
{
    /// <summary>
    /// Main form of the sample. When it loads, it extracts the text of a page
    /// range from a PDF file and displays it in <c>richTextBox1</c>.
    /// </summary>
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Load handler: extracts pages 3 through 4 of the hard-coded PDF file and
        /// shows the text in <c>richTextBox1</c>. iText and I/O failures are
        /// reported to the user in a message box.
        /// </summary>
        private void Form1_Load(object sender, EventArgs e)
        {
            try
            {
                richTextBox1.Text = extractPDFText(@"D:\pdffile\windows_server_2003_c4.pdf", 3, 4);
            }
            catch (DocumentException de)
            {
                MessageBox.Show(de.Message);
            }
            catch (IOException ioEx)
            {
                MessageBox.Show(ioEx.Message);
            }
        }

        /// <summary>
        /// Extracts text from a range of pages of a PDF file by scanning the raw
        /// content-stream tokens of each page.
        /// </summary>
        /// <param name="sourcePDF">Path of the PDF file to read.</param>
        /// <param name="fromPageNo">First page to read (1-based). 0 means the first page.</param>
        /// <param name="toPageNo">Last page to read, inclusive. 0 means the last page of the document.</param>
        /// <returns>
        /// The concatenated string tokens of the selected pages, with a space
        /// inserted for each "-600" number token and each "TJ" operator.
        /// </returns>
        public string extractPDFText(string sourcePDF, int fromPageNo, int toPageNo)
        {
            // Numeric PRTokeniser.TokType values matched below.
            // NOTE(review): presumably 1 is TokType.NUMBER and 10 is TokType.OTHER;
            // confirm against the referenced iTextSharp version.
            const int numberTokenType = 1;
            const int otherTokenType = 10;

            StringBuilder sb = new StringBuilder();
            PdfReader reader = new PdfReader(sourcePDF);

            try
            {
                if (fromPageNo == 0)
                {
                    fromPageNo = 1;
                }

                if (toPageNo == 0)
                {
                    toPageNo = reader.NumberOfPages;
                }

                // Never ask the reader for pages beyond the end of the document.
                int lastPage = Math.Min(toPageNo, reader.NumberOfPages);

                // The upper bound is inclusive; a strict '<' would silently skip
                // the last requested page (and return nothing for a one-page file).
                for (int pageNo = fromPageNo; pageNo <= lastPage; pageNo++)
                {
                    byte[] pageBytes = reader.GetPageContent(pageNo);
                    if (pageBytes == null)
                    {
                        continue;
                    }

                    // PRTokeniser does not accept a byte[] directly; its constructor
                    // requires a RandomAccessFileOrArray, so wrap the page bytes.
                    PRTokeniser token = new PRTokeniser(new RandomAccessFileOrArray(pageBytes));
                    while (token.NextToken())
                    {
                        int tokenType = (int)token.TokenType;
                        string tokenValue = token.StringValue;

                        if (tokenType == (int)PRTokeniser.TokType.STRING)
                        {
                            sb.Append(tokenValue);
                        }
                        else if (tokenType == numberTokenType && tokenValue == "-600")
                        {
                            // The original author treats a "-600" number token as a word gap.
                            sb.Append(" ");
                        }
                        else if (tokenType == otherTokenType && tokenValue == "TJ")
                        {
                            // Separate the output of consecutive "TJ" operators.
                            sb.Append(" ");
                        }
                    }
                }
            }
            finally
            {
                // Release the underlying file even when reading or tokenising fails.
                reader.Close();
            }

            return sb.ToString();
        }
    }
}
Posted
Updated 21-Jan-13 19:11pm
v2
Comments
Sergey Alexandrovich Kryukov 22-Jan-13 1:10am    
In what line?
And what is unclear in error message? See a help page on this type...
—SA
Sachin_Sharma(10) 22-Jan-13 1:16am    
token = new PRTokeniser(pageBytes);
In this line it says: The best overloaded method match for 'iTextSharp.text.pdf.PRTokeniser.PRTokeniser(iTextSharp.text.pdf.RandomAccessFileOrArray)' has some invalid arguments
Sachin_Sharma(10) 22-Jan-13 1:23am    
Can you help?
Sergey Alexandrovich Kryukov 22-Jan-13 1:28am    
Well, thanks for clarification. Next time, just comment a line with the error in code and refer to this comment in the text of the question. I don't know what needs help... well, tried. Please see my answer.
—SA
Sachin_Sharma(10) 22-Jan-13 2:23am    
Thanks to you sir, I make you sure next time I'll comment the line ...

This is what is required for the parameter of this constructor:
http://api.itextpdf.com/itext/com/itextpdf/text/pdf/RandomAccessFileOrArray.html[^].

And, according to the error message, you are trying to pass a byte[], which is not assignment-compatible with the required type. What could be more clear?

The required type implements this interface: http://docs.oracle.com/javase/1.5.0/docs/api/java/io/DataInput.html[^].

So, the page referenced above shows how to pass data to an instance of the class that is used as the parameter of the constructor. And so on…

Probably, your problem is really just the style of your work. I don't know what the exact problem is, but instead of reading the documentation step by step and implementing your steps accordingly, you… are doing who knows what, maybe trial and error, and asking strange questions. You need to act in a straightforward way. Everything is well documented.

—SA
 
Share this answer
 
// Console sample: prints the text of a PDF by scanning the raw content-stream
// tokens of each page in the selected range.
const string sourcePdf = @"c:\temp\arquivo.pdf";
var sb = new StringBuilder();
var reader = new PdfReader(sourcePdf);

try
{
	// 0 means "use the default": start at the first page.
	var fromPageNo = 0;
	if (fromPageNo == 0)
	{
		fromPageNo = 1;
	}

	// 0 means "use the default": stop at the last page of the document.
	var toPageNo = 0;
	if (toPageNo == 0)
	{
		toPageNo = reader.NumberOfPages;
	}

	// The upper bound is inclusive; a strict '<' would skip the last page.
	for (int i = fromPageNo; i <= toPageNo; i++)
	{
		byte[] pageBytes = reader.GetPageContent(i);
		if (pageBytes == null)
		{
			continue;
		}

		// PRTokeniser requires a RandomAccessFileOrArray, not a raw byte[].
		var token = new PRTokeniser(new RandomAccessFileOrArray(pageBytes));
		while (token.NextToken())
		{
			var tokenType = (int)token.TokenType;
			var tokenValue = token.StringValue;
			if (tokenType == (int)PRTokeniser.TokType.STRING)
			{
				sb.Append(tokenValue);
			}
			// NOTE(review): 1 is presumably TokType.NUMBER; confirm.
			else if (tokenType == 1 && tokenValue.Equals("-600"))
			{
				sb.Append(" ");
			}
			// NOTE(review): 10 is presumably TokType.OTHER; confirm.
			else if (tokenType == 10 && tokenValue.Equals("TJ"))
			{
				sb.Append("-> ");
			}
		}
	}
}
finally
{
	// Release the underlying file even if reading or tokenising fails.
	reader.Close();
}

Console.WriteLine(sb.ToString());
Console.ReadKey();

https://www.youtube.com/user/tutorialnacional[]
 
Share this answer
 

This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)



CodeProject, 20 Bay Street, 11th Floor Toronto, Ontario, Canada M5J 2N8 +1 (416) 849-8900