C++
  1  //This file contains extremely crude C source code to extract plain text
  2  //from a PDF file. It is only intended to show some of the basics involved
  3  //in the process and by no means good enough for commercial use.
  4  //But it can be easily modified to suit your purpose. Code is by no means
  5  //warranted to be bug free or suitable for any purpose.
  6  //
  7  //Adobe has a web site that converts PDF files to text for free,
  8  //so why would you need something like this? Several reasons:
  9  //
 10  //1) This code is entirely free, including for commercial use. It only
 11  //   requires ZLIB (from www.zlib.org) which is entirely free as well.
 12  //
 13  //2) This code tries to put tabs into appropriate places in the text,
 14  //   which means that if your PDF file contains mostly one large table,
 15  //   you can easily take the output of this program and directly read it
 16  //   into Excel! Otherwise if you select and copy the text and paste it into
 17  //   Excel there is no way to extract the various columns again.
 18  //
 19  //This code assumes that the PDF file has text objects compressed
 20  //using FlateDecode (which seems to be standard).
 21  //
 22  //This code is free. Use it for any purpose.
 23  //The author assumes no liability whatsoever for the use of this code.
 24  //Use it at your own risk!
 25  
 26  
 27  //PDF file strings (based on PDFReference15_v5.pdf from www.adobe.com):
 28  //
 29  //BT = Beginning of a text object, ET = end of a text object
 30  //5 Ts = superscript
 31  //-5 Ts = subscript
 32  //Td move to start next line
 33  
 34  //Precompiled header; remove this include if your project does not use one:
 35  #include "stdafx.h"
 36  
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <windows.h>
 39  
 40  //Your project must also include zdll.lib (ZLIB) as a dependency.
 41  //ZLIB can be freely downloaded from the internet, www.zlib.org
 42  //Use 4 byte struct alignment in your project!
 43  
 44  #include "zlib.h"
 45  
 46  //Find a string in a buffer:
 47  size_t FindStringInBuffer (char* buffer, char* search, size_t buffersize)
 48  {
 49  	char* buffer0 = buffer;
 50  
 51  	size_t len = strlen(search);
 52  	bool fnd = false;
 53  	while (!fnd)
 54  	{
 55  		fnd = true;
 56  		for (size_t i=0; i<len; i++)
 57  		{
 58  			if (buffer[i]!=search[i])
 59  			{
 60  				fnd = false;
 61  				break;
 62  			}
 63  		}
 64  		if (fnd) return buffer - buffer0;
 65  		buffer = buffer + 1;
 66  		if (buffer - buffer0 + len >= buffersize) return -1;
 67  	}
 68  	return -1;
 69  }
 70  
 71  //Keep this many previous recent characters for back reference:
 72  #define oldchar 15
 73  
 74  //Convert a recent set of characters into a number if there is one.
 75  //Otherwise return -1:
 76  float ExtractNumber(const char* search, int lastcharoffset)
 77  {
 78  	int i = lastcharoffset;
 79  	while (i>0 && search[i]==' ') i--;
 80  	while (i>0 && (isdigit(search[i]) || search[i]=='.')) i--;
 81  	float flt=-1.0;
 82  	char buffer[oldchar+5]; ZeroMemory(buffer,sizeof(buffer));
 83  	strncpy(buffer, search+i+1, lastcharoffset-i);
 84  	if (buffer[0] && sscanf(buffer, "%f", &flt))
 85  	{
 86  		return flt;
 87  	}
 88  	return -1.0;
 89  }
 90  
 91  //Check if a certain 2 character token just came along (e.g. BT):
 92  bool seen2(const char* search, char* recent)
 93  {
 94  if (    recent[oldchar-3]==search[0] 
 95       && recent[oldchar-2]==search[1] 
 96  	 && (recent[oldchar-1]==' ' || recent[oldchar-1]==0x0d || recent[oldchar-1]==0x0a) 
 97  	 && (recent[oldchar-4]==' ' || recent[oldchar-4]==0x0d || recent[oldchar-4]==0x0a)
 98  	 )
 99  	{
100  		return true;
101  	}
102  	return false;
103  }
104  
105  //This method processes an uncompressed Adobe (text) object and extracts text.
106  void ProcessOutput(FILE* file, char* output, size_t len)
107  {
108  	//Are we currently inside a text object?
109  	bool intextobject = false;
110  
111  	//Is the next character literal (e.g. \\ to get a \ character or \( to get ( ):
112  	bool nextliteral = false;
113  	
114  	//() Bracket nesting level. Text appears inside ()
115  	int rbdepth = 0;
116  
117  	//Keep previous chars to get extract numbers etc.:
118  	char oc[oldchar];
119  	int j=0;
120  	for (j=0; j<oldchar; j++) oc[j]=' ';
121  
122  	for (size_t i=0; i<len; i++)
123  	{
124  		char c = output[i];
125  		if (intextobject)
126  		{
127  			if (rbdepth==0 && seen2("TD", oc))
128  			{
129  				//Positioning.
130  				//See if a new line has to start or just a tab:
131  				float num = ExtractNumber(oc,oldchar-5);
132  				if (num>1.0)
133  				{
134  					fputc(0x0d, file);
135  					fputc(0x0a, file);
136  				}
137  				if (num<1.0)
138  				{
139  					fputc('\t', file);
140  				}
141  			}
142  			if (rbdepth==0 && seen2("ET", oc))
143  			{
144  				//End of a text object, also go to a new line.
145  				intextobject = false;
146  				fputc(0x0d, file);
147  				fputc(0x0a, file);
148  			}
149  			else if (c=='(' && rbdepth==0 && !nextliteral) 
150  			{
151  				//Start outputting text!
152  				rbdepth=1;
153  				//See if a space or tab (>1000) is called for by looking
154  				//at the number in front of (
155  				int num = ExtractNumber(oc,oldchar-1);
156  				if (num>0)
157  				{
158  					if (num>1000.0)
159  					{
160  						fputc('\t', file);
161  					}
162  					else if (num>100.0)
163  					{
164  						fputc(' ', file);
165  					}
166  				}
167  			}
168  			else if (c==')' && rbdepth==1 && !nextliteral) 
169  			{
170  				//Stop outputting text
171  				rbdepth=0;
172  			}
173  			else if (rbdepth==1) 
174  			{
175  				//Just a normal text character:
176  				if (c=='\\' && !nextliteral)
177  				{
178  					//Only print out next character no matter what. Do not interpret.
179  					nextliteral = true;
180  				}
181  				else
182  				{
183  					nextliteral = false;
184  					if ( ((c>=' ') && (c<='~')) || ((c>=128) && (c<255)) )
185  					{
186  						fputc(c, file);
187  					}
188  				}
189  			}
190  		}
191  		//Store the recent characters for when we have to go back for a number:
192  		for (j=0; j<oldchar-1; j++) oc[j]=oc[j+1];
193  		oc[oldchar-1]=c;
194  		if (!intextobject)
195  		{
196  			if (seen2("BT", oc))
197  			{
198  				//Start of a text object:
199  				intextobject = true;
200  			}
201  		}
202  	}
203  }
204  
205  int _tmain(int argc, _TCHAR* argv[])
206  {
207  	//Discard existing output:
208  	FILE* fileo = fopen("c:\\pdf\\output2.txt", "w");
209  	if (fileo) fclose(fileo);
210  	fileo = fopen("c:\\pdf\\output2.txt", "a");
211  
212  	//Open the PDF source file:
213  	FILE* filei = fopen("c:\\pdf\\somepdf.pdf", "rb");
214  
215  	if (filei && fileo)
216  	{
217  		//Get the file length:
218  		int fseekres = fseek(filei,0, SEEK_END);   //fseek==0 if ok
219  		long filelen = ftell(filei);
220  		fseekres = fseek(filei,0, SEEK_SET);
221  
222  		//Read ethe ntire file into memory (!):
223  		char* buffer = new char [filelen]; ZeroMemory(buffer, filelen);
224  		size_t actualread = fread(buffer, filelen, 1 ,filei);  //must return 1
225  
226  		bool morestreams = true;
227  
228  		//Now search the buffer repeated for streams of data:
229  		while (morestreams)
230  		{
231  			//Search for stream, endstream. We ought to first check the filter
232  			//of the object to make sure it if FlateDecode, but skip that for now!
233  			size_t streamstart = FindStringInBuffer (buffer, "stream", filelen);
234  			size_t streamend   = FindStringInBuffer (buffer, "endstream", filelen);
235  			if (streamstart>0 && streamend>streamstart)
236  			{
237  				//Skip to beginning and end of the data stream:
238  				streamstart += 6;
239  
240  				if (buffer[streamstart]==0x0d && buffer[streamstart+1]==0x0a) streamstart+=2;
241  				else if (buffer[streamstart]==0x0a) streamstart++;
242  
243  				if (buffer[streamend-2]==0x0d && buffer[streamend-1]==0x0a) streamend-=2;
244  				else if (buffer[streamend-1]==0x0a) streamend--;
245  
246  				//Assume output will fit into 10 times input buffer:
247  				size_t outsize = (streamend - streamstart)*10;
248  				char* output = new char [outsize]; ZeroMemory(output, outsize);
249  
250  				//Now use zlib to inflate:
251  				z_stream zstrm; ZeroMemory(&zstrm, sizeof(zstrm));
252  
253  				zstrm.avail_in = streamend - streamstart + 1;
254  				zstrm.avail_out = outsize;
255  				zstrm.next_in = (Bytef*)(buffer + streamstart);
256  				zstrm.next_out = (Bytef*)output;
257  
258  				int rsti = inflateInit(&zstrm);
259  				if (rsti == Z_OK)
260  				{
261  					int rst2 = inflate (&zstrm, Z_FINISH);
262  					if (rst2 >= 0)
263  					{
264  						//Ok, got something, extract the text:
265  						size_t totout = zstrm.total_out;
266  						ProcessOutput(fileo, output, totout);
267  					}
268  				}
269  				delete[] output; output=0;
270  				buffer+= streamend + 7;
271  				filelen = filelen - (streamend+7);
272  			}
273  			else
274  			{
275  				morestreams = false;
276  			}
277  		}
278  		fclose(filei);
279  	}
280  	if (fileo) fclose(fileo);
281  	return 0;
282  }