A crawler can be used to build a search engine.
This is a description of the crawler.
[PDF]Building a WebCrawler and Search Engine
First, the servlet-based crawler has the following structure:
This project consists of 1 main directory, 4 subdirectories, 1 main
page (HTML), 1 servlet deployment descriptor (DD), 4 Java files, 7 classes,
and 1 document file, as listed below:
- [hw5] --- index.html
- readme.txt
- [WEB-INF] --- web.xml
- [classes] --- [mvc] --- HelloController.java
- HelloController.class
- HelloModel.java
- HelloModel.class
- HelloView.java
- HelloView.class
- HelloResult.java
- HelloResult.class
- Bfs.class
- Queue.class
- WebSource.class
- [hw5]
- [WEB-INF]
- [classes]
- [mvc]
- index.html ( first page for crawler )
- web.xml ( the configuration for all servlets )
- HelloController.java ( process the HTTP request and response )
- HelloModel.java ( main process and crawler , url match )
- HelloView.java ( show the result of crawler and search )
- HelloResult.java ( show the search result )
- readme.txt ( document for homework 5 )
Key code of this servlet, which uses an imperfect MVC framework:
<HelloController.java>
//
// controller name
//
/**
 * Servlet mapped to "/hello.do": reads the crawl parameters from a GET
 * request, asks {@link HelloModel} to crawl, and forwards the outcome to the
 * "hello.view" resource for rendering.
 */
@WebServlet("/hello.do")
public class HelloController extends HttpServlet {
        private HelloModel model = new HelloModel();

        /**
         * Handles a crawl request.
         *
         * @param request  carries the "url" (start page) and "max" (link limit) parameters
         * @param response passed through untouched to the forwarded resource
         * @throws ServletException if forwarding to "hello.view" fails
         * @throws IOException      if forwarding to "hello.view" fails on I/O
         */
        @Override
        protected void doGet(HttpServletRequest request, HttpServletResponse response)
                        throws ServletException, IOException {
                // Raw request parameters; both are handed to the model unvalidated.
                String startUrl = request.getParameter("url");
                String linkLimit = request.getParameter("max");

                // The model returns the collected links, or null when the crawl could not start.
                Queue crawled = model.doHello(startUrl, linkLimit);

                // Expose the result (possibly null) to the view under "message".
                request.setAttribute("message", crawled);

                // Hand rendering over to the resource registered as "hello.view".
                request.getRequestDispatcher("hello.view").forward(request, response);
        }
}
<HelloModel.java>
/**
 * Crawl logic behind the servlet: starting from one page, collects links
 * breadth-first until a maximum number of links has been gathered or there
 * are no more collected pages left to expand.
 */
public class HelloModel {
        /** Link limit used when the caller supplies no usable number. */
        private static final int DEFAULT_MAX = 20;

        /** Sentinel that WebSource.getSource returns when a page cannot be read. */
        private static final String FETCH_FAILED = "-1";

        public HelloModel() {
        }

        /**
         * Crawls breadth-first from {@code url}.
         *
         * @param url   address of the start page
         * @param u_max maximum number of links to collect, as text; when it is
         *              {@code null} or not a valid integer the default of 20 is used
         * @return the collected links in discovery order, or {@code null} when the
         *         start page could not be fetched. The list produced from the start
         *         page is returned as-is even if it already exceeds the limit.
         */
        public Queue doHello(String url, String u_max) {
                String firstPage = new WebSource().getSource(url);
                if (FETCH_FAILED.equals(firstPage)) {
                        return null;
                }
                // Bfs.listGen turns page source into a Queue of links
                // (implementation not visible here; assumed to extract hyperlinks).
                Queue visted_list = new Bfs().listGen(firstPage);

                int max = parseMax(u_max);
                // Index of the next collected link whose page will be expanded.
                int cursor = 0;
                while (visted_list.size() < max && cursor < visted_list.size()) {
                        String pageSource = new WebSource().getSource(visted_list.showIndexString(cursor));
                        cursor++;
                        if (FETCH_FAILED.equals(pageSource)) {
                                // Unreadable page: skip it and expand the next collected link.
                                continue;
                        }
                        appendNewLinks(visted_list, new Bfs().listGen(pageSource), max);
                }
                return visted_list;
        }

        /** Parses the requested link limit, falling back to the default when it is missing or malformed. */
        private static int parseMax(String u_max) {
                if (u_max == null) {
                        return DEFAULT_MAX;
                }
                try {
                        return Integer.parseInt(u_max.trim());
                } catch (NumberFormatException e) {
                        return DEFAULT_MAX;
                }
        }

        /** Appends links from {@code found} that are not yet in {@code visited}, stopping once {@code max} is reached. */
        private static void appendNewLinks(Queue visited, Queue found, int max) {
                for (int i = 0; i < found.size() && visited.size() < max; i++) {
                        try {
                                String link = found.showIndexString(i);
                                if (!visited.isVisted(link)) {
                                        visited.add(link);
                                }
                        } catch (Exception ignored) {
                                // Best effort: one unusable entry must not abort the crawl,
                                // so it is skipped and the remaining links are still processed.
                        }
                }
        }
}
<WebSource.class>
/**
 * Downloads the textual content of a URL.
 */
class WebSource {
        /**
         * Reads the resource at {@code url} as UTF-8 text.
         *
         * <p>Lines are concatenated without their line terminators, so the result
         * is a single-line string.
         *
         * @param url address to read; anything {@link URL} can open
         * @return the content, or the sentinel string {@code "-1"} when the URL is
         *         malformed or {@code null}, or when opening, reading or closing
         *         the stream fails
         */
        public String getSource(String url) {
                // try-with-resources closes both handles on every path, including
                // when opening succeeded but wrapping or reading failed.
                try (InputStream in = new URL(url).openStream();
                     BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"))) {
                        StringBuilder page = new StringBuilder();
                        String line;
                        while ((line = reader.readLine()) != null) {
                                page.append(line);
                        }
                        return page.toString();
                } catch (Exception e) {
                        // Boundary catch: callers only distinguish success from the
                        // "-1" sentinel, so every failure is reported the same way.
                        return "-1";
                }
        }//getSource
}
<Queue.class>
/**
 * FIFO queue of strings that also supports positional reads and membership
 * checks. Index 0 always refers to the oldest element still in the queue.
 */
class Queue {
        // Oldest element at the head, newest at the tail.
        private final LinkedList<String> linkedList;

        public Queue() {
                linkedList = new LinkedList<String>();
        }

        /**
         * Appends an element at the back of the queue.
         *
         * @param data the string to enqueue
         */
        public void add(String data) {
                linkedList.addLast(data);
        }

        /**
         * Removes and returns the oldest element.
         *
         * @return the element that was added first among those still queued
         * @throws java.util.NoSuchElementException if the queue is empty
         */
        public String remove() {
                return linkedList.removeFirst();
        }

        /**
         * Prints every element to standard output, oldest first, one per line.
         * Intended for command-line testing.
         */
        public void show() {
                for (String element : linkedList) {
                        System.out.println(element);
                }
        }

        /**
         * Returns the element at the given position without removing it.
         *
         * @param index zero-based position counted from the oldest element
         * @return the element at that position
         * @throws IndexOutOfBoundsException if {@code index} is outside the queue
         */
        public String showIndexString(int index) {
                return linkedList.get(index);
        }

        /**
         * @return the number of elements currently queued
         */
        public int size() {
                return linkedList.size();
        }

        /**
         * @return {@code true} when the queue holds no elements
         */
        public boolean isEmpty() {
                return linkedList.isEmpty();
        }

        /**
         * Checks whether a string is already in the queue.
         *
         * @param testString the string to look for
         * @return {@code true} if an equal string is queued
         */
        public boolean isVisted(String testString) {
                // Single pass over the list; also tolerates null elements.
                return linkedList.contains(testString);
        }
}
  
You can download the whole project from:
SourceForge
沒有留言:
張貼留言