A crawler can be used to build a search engine.
This is a description of the crawler.
[PDF]Building a WebCrawler and Search Engine
First, the servlet-based crawler has the structure shown below:
This project consists of 1 main directory, 4 subdirectories, 1 main
page (HTML), 1 servlet deployment descriptor (DD), 4 Java files, 7 classes,
and 1 document file, as listed below:
- [hw5] --- index.html
- readme.txt
- [WEB-INF] --- web.xml
- [classes] --- [mvc] --- HelloController.java
- HelloController.class
- HelloModel.java
- HelloModel.class
- HelloView.java
- HelloView.class
- HelloResult.java
- HelloResult.class
- Bfs.class
- Queue.class
- WebSource.class
- [hw5]
- [classes]
- [mvc]
- index.html ( first page for crawler )
- web.xml ( the configuration of all servlets )
- HelloController.java ( process the HTTP request and response )
- HelloModel.java ( main process and crawler , url match )
- HelloView.java ( show the result of crawler and search )
- HelloResult.java ( show the search result )
- readme.txt ( document for homework 5 )
Key code of this servlet, which uses an imperfect MVC framework:
// Controller: servlet mapped to the URL pattern "/hello.do".
// NOTE(review): this listing is an abridged excerpt -- closing braces and at
// least one statement (see the note above the `if` below) are not shown.
@WebServlet ("/hello.do")
public class HelloController extends HttpServlet {
// Model object that doGet delegates the crawl to.
private HelloModel model = new HelloModel();
/**
 * Handles a GET request: reads the "url" and "max" request parameters,
 * passes them to HelloModel.doHello, and forwards to "hello.view".
 */
protected void doGet (HttpServletRequest request ,HttpServletResponse response ) throws ServletException , IOException {
// Read the request parameters "url" and "max" (both as raw strings).
String url = request.getParameter("url");
String max = request.getParameter("max");
// Delegate to HelloModel.doHello, which returns a Queue (or null).
Queue visted_list = model.doHello(url,max);
// NOTE(review): the original comment here said the result is placed on the
// request as an attribute, but no such statement is visible in this excerpt;
// as printed, the `if` guards the forward below -- confirm against the full source.
if(visted_list != null)
// Forward the request to "hello.view" for rendering.
request.getRequestDispatcher("hello.view").forward(request , response);
// Model: fetches a start page, extracts a list from it via Bfs.listGen, and
// then keeps fetching listed entries until the list reaches a maximum size.
// NOTE(review): this listing is an abridged excerpt -- closing braces, the
// `try` that pairs with the `catch` below, and several statements are missing.
public class HelloModel {
// Constructor; its body is not shown in this excerpt.
public HelloModel(){
/**
 * Crawls starting from {@code url}.
 *
 * @param url   address passed to WebSource.getSource for the first page
 * @param u_max textual upper bound on the list size; parsed with Integer.parseInt
 * @return the Queue built from the crawl, or null (see note on the early return)
 */
public Queue doHello (String url , String u_max){
// Fetch the source text of the start page.
WebSource newWebPage = new WebSource();
String message = newWebPage.getSource(url);
// NOTE(review): the condition guarding this early return is not shown;
// presumably it checks for WebSource's "-1" failure result -- confirm.
return null;
// Build the initial list from the start page's text.
Bfs newlist = new Bfs();
Queue visted_list = newlist.listGen(message);
// Breadth-first expansion until the list holds `max` entries.
// oriSize is the index of the next entry of visted_list to fetch.
int oriSize = 0;
// Initial value 20; overwritten below by the parsed u_max.
int max = 20;
WebSource nextWebPage = null;
String nextMessage = "";
Bfs nextCraw = null;
Queue next_list = null;
String newLink = "";
String nextLink = "";
// Integer.parseInt throws NumberFormatException on non-numeric input.
// NOTE(review): presumably inside the elided `try` -- confirm.
max = Integer.parseInt(u_max);
while(visted_list.size() < max)
// Stop when the index has caught up with the list size (nothing left to fetch).
if(oriSize >= visted_list.size()){
return visted_list;
// Fetch the page for the entry at index oriSize and extract its list.
nextLink = visted_list.showIndexString(oriSize);
nextWebPage = new WebSource();
nextMessage = nextWebPage.getSource(nextLink); // page source for nextLink
nextCraw = new Bfs();
next_list = nextCraw.listGen(nextMessage);
// Walk the entries extracted from that page.
// NOTE(review): the statements that add newLink to visted_list and that
// advance oriSize are not shown in this excerpt.
for(int i = 0 ; i < next_list.size(); i++){
newLink = next_list.showIndexString(i);
// Size limit reached; the body of this branch is not shown.
if(visted_list.size() >= max){
// NOTE(review): the handler body is not shown; the matching `try` is elided.
catch(Exception e){
return visted_list;
/**
 * Downloads the text content behind a URL.
 *
 * <p>Reconstructed from an abridged listing: the read-loop body and the
 * resource clean-up were not visible. This version keeps all state local to
 * the call and closes the stream with try-with-resources, so nothing leaks
 * when the download fails part-way.
 */
class WebSource {
    /** Value returned by {@link #getSource(String)} when the page cannot be read. */
    private static final String ERROR_RESULT = "-1";

    /**
     * Reads the resource at {@code url} as UTF-8 text.
     *
     * @param url the address to open; any scheme accepted by {@link URL}
     * @return the content with each line terminated by {@code '\n'}, or
     *         {@code "-1"} if the URL is invalid or cannot be read
     */
    public String getSource(String url) {
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new URL(url).openStream(), "UTF-8"))) {
            StringBuilder page = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                // Keep line boundaries so text on adjacent lines is not fused together.
                page.append(line).append('\n');
            }
            return page.toString();
        } catch (Exception e) {
            // Callers test for the "-1" sentinel, so every failure
            // (malformed or null URL, I/O error) is reported that way instead of thrown.
            return ERROR_RESULT;
        }
    }
}
/**
 * FIFO queue of strings with indexed read access, used as the crawler's
 * list of visited links.
 *
 * <p>Reconstructed from an abridged listing. The head of the queue is the
 * LAST element of the backing list ({@link #remove()} takes from the end and
 * {@link #showIndexString(int)} counts from the end), so new elements are
 * inserted at the front to give first-in-first-out order.
 */
class Queue {
    // Typed list: the raw LinkedList in the listing would not compile with String returns.
    private final LinkedList<String> linkedList;

    public Queue() {
        linkedList = new LinkedList<>();
    }

    /** Pushes {@code data} onto the tail of the queue. */
    public void add(String data) {
        linkedList.addFirst(data);
    }

    /**
     * Pops the oldest element.
     *
     * @return the element that was added first
     * @throws java.util.NoSuchElementException if the queue is empty
     */
    public String remove() {
        return linkedList.removeLast();
    }

    /** Prints every element, oldest first, one per line (for command-line testing). */
    public void show() {
        for (int j = linkedList.size() - 1; j >= 0; j--) {
            System.out.println(linkedList.get(j));
        }
    }

    /**
     * Returns the element at {@code index} in insertion order (0 is the oldest).
     *
     * @throws IndexOutOfBoundsException if {@code index} is outside {@code [0, size())}
     */
    public String showIndexString(int index) {
        return linkedList.get(linkedList.size() - 1 - index);
    }

    /** Returns the number of elements in the queue. */
    public int size() {
        return linkedList.size();
    }

    /** Returns {@code true} if the queue holds no elements. */
    public boolean isEmpty() {
        return linkedList.isEmpty();
    }

    /** Returns {@code true} if {@code testString} is already in the queue. */
    public boolean isVisted(String testString) {
        return linkedList.contains(testString);
    }
}
You can download the whole project from: