2013年6月6日 星期四

A simple Crawler (簡單的網路爬蟲)

We can build a simple crawler using Java Servlets and JSP.

A crawler can be used to build a search engine.

Here is a description of the crawler.


[PDF]Building a WebCrawler and Search Engine

First, the servlet-based crawler is structured as follows:

This project consists of 1 main directory, 4 subdirectories, 1 main
page (HTML), 1 servlet deployment descriptor (DD), 4 Java files, 7 classes, and 1 document
file, as listed below:

- [hw5] --- index.html
              - readme.txt
              - [WEB-INF] --- web.xml
                                     - [classes] --- [mvc] --- HelloController.java
                                                                       - HelloController.class
                                                                       - HelloModel.java
                                                                       - HelloModel.class
                                                                       - HelloView.java
                                                                       - HelloView.class
                                                                       - HelloResult.java
                                                                       - HelloResult.class
                                                                       - Bfs.class
                                                                       - Queue.class
                                                                       - WebSource.class
- [hw5]    
- [WEB-INF]               
- [classes]       
- [mvc]

- index.html                              ( first page for crawler         )
- web.xml                                 ( the configuration of all servlets   )
- HelloController.java               ( process the HTTP request and response    )
- HelloModel.java                     ( main process and crawler , url match     )
- HelloView.java                       ( show the result of crawler and search    )      
- HelloResult.java                    ( show the search result          )
- readme.txt                             ( document for homework 5         )

Key code of this servlet, which uses an imperfect MVC framework:

<HelloController.java>

// // controller name // @WebServlet ("/hello.do") public class HelloController extends HttpServlet { private HelloModel model = new HelloModel(); @Override protected void doGet (HttpServletRequest request ,HttpServletResponse response ) throws ServletException , IOException { // //get request parameter // String url = request.getParameter("url"); String max = request.getParameter("max"); // //call "doHello" method in "HelloModel" Object to process attribute "name" // Queue visted_list = model.doHello(url,max); // //let result "message" become to request object and attribute. // if(visted_list != null) request.setAttribute("message",visted_list); else request.setAttribute("message",null); // //forward to "hello.view" to show // request.getRequestDispatcher("hello.view").forward(request , response); } }

<HelloModel.java>

public class HelloModel { public HelloModel(){ } public Queue doHello (String url , String u_max){ WebSource newWebPage = new WebSource(); String message = newWebPage.getSource(url); if(message.equals("-1")){ return null; } Bfs newlist = new Bfs(); Queue visted_list = newlist.listGen(message); // // BFS search to the limit MAX // int oriSize = 0; int max = 20; WebSource nextWebPage = null; String nextMessage = ""; Bfs nextCraw = null; Queue next_list = null; String newLink = ""; String nextLink = ""; max = Integer.parseInt(u_max); while(visted_list.size() < max) { if(oriSize >= visted_list.size()){ return visted_list; } nextLink = visted_list.showIndexString(oriSize); nextWebPage = new WebSource(); nextMessage = nextWebPage.getSource(nextLink); //source if(nextMessage.equals("-1")){ oriSize++; continue; } nextCraw = new Bfs(); next_list = nextCraw.listGen(nextMessage); for(int i = 0 ; i < next_list.size(); i++){ try{ newLink = next_list.showIndexString(i); if(!visted_list.isVisted(newLink)){ visted_list.add(newLink); if(visted_list.size() >= max){ break; } } } catch(Exception e){ } } oriSize++; }//while-loop return visted_list; } }
<WebSource.class>

/**
 * Downloads the text of a web page.
 */
class WebSource {

    /**
     * Fetches the resource at {@code url} and returns its content decoded as
     * UTF-8, with all lines concatenated (line terminators are dropped).
     *
     * @param url address of the resource to read
     * @return the concatenated content, or the sentinel string {@code "-1"}
     *         when the URL is invalid or the resource cannot be read
     */
    public String getSource(String url) {
        // try-with-resources closes both streams on every path, including the
        // case where opening the connection fails and nothing was allocated.
        try (InputStream in = new URL(url).openStream();
             BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"))) {
            StringBuilder page = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                page.append(line);
            }
            return page.toString();
        } catch (Exception e) {
            // Callers test for the "-1" sentinel, so every failure (malformed
            // URL, I/O error, null argument) is reported the same way.
            return "-1";
        }
    }
}
<Queue.class>

/**
 * FIFO queue of strings that also allows indexed look-up in insertion order.
 *
 * New elements are stored at the head of the backing list, so the oldest
 * element is always the last one.
 */
class Queue {

    private LinkedList<String> linkedList;

    public Queue() {
        linkedList = new LinkedList<>();
    }

    /**
     * Appends an element to the queue.
     *
     * @param data the string to enqueue
     */
    public void add(String data) {
        linkedList.addFirst(data);
    }

    /**
     * Removes and returns the oldest element.
     *
     * @return the element that was added first
     * @throws java.util.NoSuchElementException if the queue is empty
     */
    public String remove() {
        return linkedList.removeLast();
    }

    /**
     * Prints every element to standard output, oldest first (for testing on
     * the command line).
     */
    public void show() {
        // Walk from the tail (oldest) to the head without repeated indexed access.
        for (java.util.Iterator<String> it = linkedList.descendingIterator(); it.hasNext();) {
            System.out.println(it.next());
        }
    }

    /**
     * Returns the element at the given position in insertion order.
     *
     * @param index 0 for the oldest element, {@code size() - 1} for the newest
     * @return the element at that position
     * @throws IndexOutOfBoundsException if {@code index} is out of range
     */
    public String showIndexString(int index) {
        return linkedList.get(linkedList.size() - 1 - index);
    }

    /**
     * @return the number of elements in the queue
     */
    public int size() {
        return linkedList.size();
    }

    /**
     * @return {@code true} when the queue holds no elements
     */
    public boolean isEmpty() {
        return linkedList.isEmpty();
    }

    /**
     * Checks whether a string is already in the queue.
     *
     * @param testString the string to look for
     * @return {@code true} if an equal element is present
     */
    public boolean isVisted(String testString) {
        return linkedList.contains(testString);
    }
}
You can download the whole project from:
SourceForge


沒有留言:

張貼留言