A crawler can be used to build a search engine.
This post describes the crawler.
[PDF]Building a WebCrawler and Search Engine
First, the servlet-based crawler is organized as follows:
This project consists of 1 main directory, 4 subdirectories, 1 main
page (HTML), 1 servlet deployment descriptor (DD), 4 Java files, 7 classes, and 1 document
file, as listed below:
- [hw5] --- index.html
- readme.txt
- [WEB-INF] --- web.xml
- [classes] --- [mvc] --- HelloController.java
- HelloController.class
- HelloModel.java
- HelloModel.class
- HelloView.java
- HelloView.class
- HelloResult.java
- HelloResult.class
- Bfs.class
- Queue.class
- WebSource.class
- [hw5]
- [WEB-INF]
- [classes]
- [mvc]
- index.html ( first page for crawler )
- web.xml ( the configuration of all servlets )
- HelloController.java ( process the HTTP request and response )
- HelloModel.java ( main process and crawler , url match )
- HelloView.java ( show the result of crawler and search )
- HelloResult.java ( show the search result )
- readme.txt ( document for homework 5 )
Key code of this servlet, which uses an imperfect MVC framework.
<HelloController.java>
//
// controller name
//
/**
 * Servlet mapped to "/hello.do". It reads the crawl parameters from the
 * request, hands them to {@link HelloModel}, and forwards the outcome to the
 * "hello.view" resource for rendering.
 */
@WebServlet ("/hello.do")
public class HelloController extends HttpServlet {
    private HelloModel model = new HelloModel();

    /**
     * Handles a GET request carrying the "url" and "max" parameters.
     *
     * @param request  incoming request; "url" and "max" are read from it
     * @param response response handed on to the view
     * @throws ServletException propagated from the forward
     * @throws IOException      propagated from the forward
     */
    @Override
    protected void doGet (HttpServletRequest request ,HttpServletResponse response ) throws ServletException , IOException {
        // Run the crawl with the raw parameter values; the model returns null
        // when the start page could not be fetched.
        Queue crawled = model.doHello(request.getParameter("url"), request.getParameter("max"));

        // The "message" attribute carries the result to the view. A null
        // result is stored as null, exactly as a non-null one is stored as-is.
        request.setAttribute("message", crawled);

        // Hand over to the view for rendering.
        request.getRequestDispatcher("hello.view").forward(request , response);
    }
}
<HelloModel.java>
/**
 * Crawl driver: starting from one page, it keeps fetching the pages named in
 * the collected link list, in list order, until the list reaches a size limit
 * or there are no unprocessed entries left.
 */
public class HelloModel {
    /** Limit used when the caller does not supply a usable number. */
    private static final int DEFAULT_MAX = 20;

    public HelloModel(){
    }

    /**
     * Crawls breadth-first from {@code url}.
     *
     * @param url   address of the start page
     * @param u_max textual upper bound on the number of collected links; a
     *              missing or non-numeric value falls back to 20
     * @return the collected links, or {@code null} when the start page could
     *         not be fetched. The list produced from the start page is not
     *         truncated, so it may already exceed the limit.
     */
    public Queue doHello (String url , String u_max){
        String message = new WebSource().getSource(url);
        // WebSource reports any fetch failure with the sentinel "-1".
        if(message.equals("-1")){
            return null;
        }
        // NOTE(review): Bfs.listGen presumably extracts the links of a page;
        // its implementation is not visible here.
        Queue visted_list = new Bfs().listGen(message);

        int max = parseLimit(u_max);
        // Index of the next entry of visted_list whose page has not been fetched yet.
        int oriSize = 0;
        while(visted_list.size() < max){
            if(oriSize >= visted_list.size()){
                // Every collected link has been processed; nothing left to expand.
                return visted_list;
            }
            String nextLink = visted_list.showIndexString(oriSize);
            oriSize++;
            String nextMessage = new WebSource().getSource(nextLink);
            if(nextMessage.equals("-1")){
                // Unreachable page: skip it and move on to the next entry.
                continue;
            }
            Queue next_list = new Bfs().listGen(nextMessage);
            for(int i = 0 ; i < next_list.size() && visted_list.size() < max ; i++){
                try{
                    String newLink = next_list.showIndexString(i);
                    if(!visted_list.isVisted(newLink)){
                        visted_list.add(newLink);
                    }
                }
                catch(RuntimeException ignored){
                    // Best effort: one malformed entry must not abort the
                    // whole crawl, so it is skipped.
                }
            }
        }
        return visted_list;
    }

    /** Parses the limit, falling back to DEFAULT_MAX for null or non-numeric text. */
    private static int parseLimit(String raw){
        if(raw == null){
            return DEFAULT_MAX;
        }
        try{
            return Integer.parseInt(raw.trim());
        }
        catch(NumberFormatException e){
            return DEFAULT_MAX;
        }
    }
}
<WebSource.class>
/**
 * Fetches the text behind a URL.
 */
class WebSource{
    /**
     * Reads the resource at {@code url} as UTF-8 text and returns all of its
     * lines concatenated without line terminators.
     *
     * @param url address to read; may be any URL the JVM can open
     * @return the concatenated content, or the sentinel string "-1" when the
     *         URL is malformed (including {@code null}) or reading fails
     */
    public String getSource(String url){
        InputStream in = null;
        BufferedReader br = null;
        try {
            in = new URL(url).openStream();
            br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
            StringBuilder message = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                // readLine strips the terminator, so lines are joined directly.
                message.append(line);
            }
            return message.toString();
        }
        catch (Exception e) {
            // Callers test for "-1" to detect an unreachable or unreadable page.
            return "-1";
        }
        finally {
            try {
                if (br != null) {
                    // Closing the outer reader also closes the wrapped stream.
                    br.close();
                }
                else if (in != null) {
                    // The stream was opened but never wrapped.
                    in.close();
                }
            }
            catch (Exception ignored) {
                // A failed close must not replace a result that was already
                // read successfully, so nothing is returned from here.
            }
        }
    }//getSource
}
<Queue.class>
/**
 * First-in, first-out queue of strings that can also be read by insertion
 * index and searched, so it doubles as the list of links already collected.
 */
class Queue{
    // Elements are kept oldest-first: index 0 is the next one to be removed.
    private final LinkedList<String> linkedList;

    public Queue() {
        linkedList = new LinkedList<String>();
    }

    /**
     * Appends an element at the back of the queue.
     *
     * @param data element to enqueue
     */
    public void add(String data) {
        linkedList.addLast(data);
    }

    /**
     * Removes and returns the oldest element.
     *
     * @return the element that was added first among those still queued
     * @throws java.util.NoSuchElementException if the queue is empty
     */
    public String remove() {
        return linkedList.removeFirst();
    }

    /**
     * Prints every element to standard output, oldest first, one per line.
     * Intended for command-line testing.
     */
    public void show(){
        for(String element : linkedList){
            System.out.println(element);
        }
    }

    /**
     * Returns the element at the given insertion position without removing it.
     *
     * @param index zero-based position, where 0 is the oldest element
     * @return the element at that position
     * @throws IndexOutOfBoundsException if index is outside 0..size()-1
     */
    public String showIndexString(int index){
        return linkedList.get(index);
    }

    /**
     * @return the number of queued elements
     */
    public int size(){
        return linkedList.size();
    }

    /**
     * @return true when the queue holds no elements
     */
    public boolean isEmpty() {
        return linkedList.isEmpty();
    }

    /**
     * Checks whether an equal string is already queued.
     *
     * @param testString string to look for
     * @return true if some element equals testString
     */
    public boolean isVisted(String testString){
        return linkedList.contains(testString);
    }
}
You can download the whole project from:
SourceForge
沒有留言:
張貼留言