Click here to Skip to main content
15,890,399 members
Please Sign up or sign in to vote.
0.00/5 (No votes)
See more:
Hello, I have a problem: when I crawl a lot of web pages (there are more than 50,000 URLs to process), it executes very slowly, so I want to refactor it to use threads. Can anybody give me some ideas? Thank you very much!
import ......
/**
 * Downloads the web page belonging to one DOI and stores it as a UTF-8 file in
 * the case_mesh directory, named after the matching {@code casebase.filename}
 * database entry with its extension removed.
 *
 * <p>Each instance handles exactly one DOI. {@link #main(String[])} feeds the
 * instances to a fixed-size thread pool instead of starting one operating
 * system thread per DOI.
 */
public class Down2011CaseMeshTread extends Thread {

    /** Not used inside this class; kept because it is part of the public surface. */
    public static int count = 0;

    /** DOIs to process, read once from disk when the class is initialised. */
    public static List<String> docDoiList = getCaseDoiList2();

    /** Size, in chars, of the copy buffer used by {@link #Test()}. */
    static int BUFFER_SIZE = 1024*10;

    /** Directory the downloaded pages are written to. */
    private static final String OUTPUT_DIR = "/home/boge/workspace1/IDF/case_mesh/";

    /** Number of downloads that {@link #main(String[])} runs at the same time. */
    private static final int POOL_SIZE = 16;

    /** Connect and read timeout, in milliseconds, for one download. */
    private static final int TIMEOUT_MILLIS = 1000*60*10;

    // Per-instance state. These used to be static, which made every newly
    // constructed instance overwrite the target of all other threads.
    private final URL url;
    private final String doi;

    /** File name already resolved by the caller, or null to look it up lazily. */
    private final String docName;

    /**
     * Creates a download task for the given DOI.
     *
     * @param doi identifier appended to the base URL
     * @throws MalformedURLException if the resulting URL cannot be parsed
     */
    public Down2011CaseMeshTread(String doi) throws MalformedURLException {
        this(doi, null);
    }

    /** Same as the public constructor, but reuses a file name that was already looked up. */
    private Down2011CaseMeshTread(String doi, String docName) throws MalformedURLException {
        this.url = new URL("http://www.codeproject.com/" + doi);
        this.doi = doi;
        this.docName = docName;
    }

    /**
     * Opens a new connection to the clef11 MySQL database.
     *
     * @return a fresh connection; the caller is responsible for closing it
     * @throws Exception if the driver class cannot be loaded or the connection fails
     */
    public static Connection getConnection() throws Exception {
        // NOTE(review): host and credentials are hard-coded; consider moving them to configuration.
        String driver = "com.mysql.jdbc.Driver";
        String jdbcUrl = "jdbc:mysql://192.168.1.102:3306/clef11";
        String username = "root";
        String password = "111111";
        Class.forName(driver);
        return DriverManager.getConnection(jdbcUrl, username, password);
    }

    /**
     * Looks up the file name stored for a DOI and strips its extension.
     *
     * @param docDoi DOI to look up
     * @return the file name without extension (last matching row wins), or
     *         {@code null} if there is no row or the lookup failed
     */
    public static String getDocNameByDoi(String docDoi) {
        Connection conn = null;
        PreparedStatement pstmt = null;
        ResultSet rs = null;
        String docName = null;
        try {
            conn = getConnection();
            pstmt = conn.prepareStatement("select filename from casebase where doi = ?");
            pstmt.setString(1, docDoi);
            rs = pstmt.executeQuery();
            while (rs.next()) {
                String name = rs.getNString(1);
                if (name != null) {
                    docName = stripExtension(name);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close each resource on its own: any of them may still be null when
            // the failure happened early, and one failing close must not skip the rest.
            if (rs != null) {
                try {
                    rs.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
            if (pstmt != null) {
                try {
                    pstmt.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
            if (conn != null) {
                try {
                    conn.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }
        return docName;
    }

    /** Returns {@code name} without its last ".ext" part, or unchanged if it contains no dot. */
    private static String stripExtension(String name) {
        int dot = name.lastIndexOf('.');
        return dot < 0 ? name : name.substring(0, dot);
    }

    /** Closes a stream if it was opened; a failure to close is reported but not propagated. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads this instance's URL into the output directory.
     *
     * <p>Failures are printed to standard error and not propagated. A partially
     * written file is deleted so that a later run does not mistake it for a
     * finished download.
     *
     * @throws IOException declared for compatibility with existing callers
     * @throws InterruptedException declared for compatibility with existing callers
     */
    public void Test() throws IOException, InterruptedException {
        String name = docName != null ? docName : getDocNameByDoi(doi);
        if (name == null) {
            System.err.println("No file name found for doi " + doi + "; skipping download");
            return;
        }

        File target = new File(OUTPUT_DIR + name);
        BufferedReader in = null;
        BufferedWriter out = null;
        boolean complete = false;
        try {
            URLConnection conn = url.openConnection();
            conn.setRequestProperty("User-Agent", "Mozilla/4.76 (compatible; MSIE 5.0; Windows NT; DigExt)");
            conn.setConnectTimeout(TIMEOUT_MILLIS);
            conn.setReadTimeout(TIMEOUT_MILLIS);
            // Read from the configured connection. url.openStream() would open a
            // second connection that ignores the header and timeouts set above.
            // NOTE(review): the response is decoded with the platform default charset.
            in = new BufferedReader(new InputStreamReader(conn.getInputStream()));
            out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(target), "utf-8"));

            // read() blocks until data arrives or the stream ends, so no polling
            // on ready() is needed (ready() stays false at end of stream).
            char[] buffer = new char[BUFFER_SIZE];
            int charsRead;
            while ((charsRead = in.read(buffer, 0, buffer.length)) != -1) {
                out.write(buffer, 0, charsRead);
            }
            out.flush();
            complete = true;
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(out);
            closeQuietly(in);
            if (!complete && target.exists()) {
                target.delete();
            }
        }
    }

    /** Runs {@link #Test()} and reports any exception on standard error. */
    @Override
    public void run() {
        try {
            Test();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Restore the interrupt flag so the owning executor can see it.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }

    /**
     * Reads the DOI list file, one DOI per line.
     *
     * @return the trimmed, non-empty lines of the file; empty (or partial) if
     *         the file cannot be read
     */
    public static List<String> getCaseDoiList2() {
        List<String> dois = new ArrayList<String>();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(new FileInputStream("/home/boge/5.28.3")));
            String line;
            while ((line = br.readLine()) != null) {
                String doiLine = line.trim();
                // A blank line would otherwise become a download of the bare base URL.
                if (doiLine.length() > 0) {
                    dois.add(doiLine);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(br);
        }
        return dois;
    }

    /**
     * Lists the names of the regular, non-hidden files directly inside a directory.
     *
     * @param fileName path of the directory
     * @return the file names; empty if the path is not a readable directory
     */
    public static List<String> getFiles(String fileName) {
        List<String> fileList = new ArrayList<String>();
        File[] entries = new File(fileName).listFiles();
        if (entries == null) {
            // listFiles() returns null when the path is missing or not a directory.
            return fileList;
        }
        for (File file : entries) {
            if (file.isFile() && !file.isHidden()) {
                fileList.add(file.getName());
            }
        }
        return fileList;
    }

    /**
     * Downloads every DOI in {@link #docDoiList} whose output file does not
     * exist yet, running at most {@code POOL_SIZE} downloads concurrently.
     *
     * @param args unused
     * @throws MalformedURLException if a DOI yields an invalid URL
     */
    public static void main(String args[]) throws MalformedURLException {
        // A hash set keeps the "already downloaded" check cheap for tens of thousands of files.
        java.util.Set<String> downloaded = new java.util.HashSet<String>(getFiles(OUTPUT_DIR));
        java.util.concurrent.ExecutorService pool =
                java.util.concurrent.Executors.newFixedThreadPool(POOL_SIZE);
        try {
            for (String docDoi : docDoiList) {
                String name = getDocNameByDoi(docDoi);
                if (name == null) {
                    System.err.println("No file name found for doi " + docDoi + "; skipping download");
                    continue;
                }
                if (downloaded.contains(name)) {
                    continue;
                }
                // The instance is used as a plain Runnable: the pool's worker
                // threads call run(), so the thread count stays bounded.
                pool.execute(new Down2011CaseMeshTread(docDoi, name));
            }
        } finally {
            // Already queued downloads still run; the workers exit once the queue is empty.
            pool.shutdown();
        }
    }
}


But this code can't work correctly. Can you help me? :)
Posted

1 solution

You can use the ThreadPoolExecutor to split your application's payload into Runnable tasks.
Your code is a bit too long for me to clean up, so I'll just show you an example of how to download web pages in parallel using this approach:

Java
package threadtest;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
/**
 * Demonstrates downloading web pages in parallel by handing one
 * {@link Runnable} per page to a {@link ThreadPoolExecutor}.
 */
public class Program {

  /** Task that fetches a single URL and reports how many characters it received. */
  private class Downloader implements Runnable {
    private final URL url;

    public Downloader(URL url) {
      this.url = url;
    }

    /** Drains the reader one character at a time and returns everything read. */
    private String readAll(Reader reader) throws IOException {
      StringBuilder text = new StringBuilder();
      for (int c = reader.read(); c != -1; c = reader.read()) {
        text.append((char) c);
      }
      return text.toString();
    }

    /** Downloads the page; an I/O failure is printed to standard error. */
    @Override
    public void run() {
      try {
        Reader source = new BufferedReader(new InputStreamReader(url.openStream()));
        try {
          String page = readAll(source);
          System.out.printf("Read %d characters from %s\n", page.length(), url);
        } finally {
          source.close();
        }
      } catch (IOException e) {
        System.err.println(e);
      }
    }
  }

  /**
   * Submits twelve identical download tasks to a pool of 8 core / 16 maximum
   * threads backed by a 1024-slot queue, then requests an orderly shutdown.
   *
   * @throws MalformedURLException if the demo URL cannot be parsed
   */
  public void runIt() throws MalformedURLException {
    final int taskCount = 12;
    BlockingQueue<Runnable> pending = new ArrayBlockingQueue<Runnable>(1024);
    ThreadPoolExecutor pool = new ThreadPoolExecutor(8, 16, 60, TimeUnit.SECONDS, pending);

    for (int i = 0; i < taskCount; i++) {
      pool.submit(new Downloader(new URL("http://www.google.com")));
    }

    // Queued tasks still run after shutdown(); only new submissions are refused.
    pool.shutdown();
  }

  /** Starts the demo and then blocks until something is read from standard input. */
  public static void main(String[] args) throws IOException {
    new Program().runIt();
    System.in.read();
  }
}



Hope this helps,
Fredrik Bornander
 
Share this answer
 
Comments
tiancehngbo 31-May-11 21:57pm    
Thank you very much, I tried it with

ExecutorService exe = Executors.newFixedThreadPool(POOL_SIZE);

But I face a problem: after downloading 1000 pages, it fails with the error:
java.io.IOException: Premature EOF
and then it fails with the error:
Connection time out
I don't know what the problem is; maybe your solution can help. I will try it now!
Fredrik Bornander 2-Jun-11 4:52am    
Without seeing your code there's no way to tell you what is wrong.

This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)



CodeProject, 20 Bay Street, 11th Floor Toronto, Ontario, Canada M5J 2N8 +1 (416) 849-8900