// 网络爬行者源代码介绍 (Web crawler source code overview)
import java.awt.*;
import java.awt.event.*;
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.*;
import javax.swing.*;
import javax.swing.table.*;
//一个Web的爬行者(注:爬行在这里的意思与抓取,捕获相同)
public class SearchCrawler extends JFrame{
/** Preset choices offered by the editable "max URLs" combo box. */
private static final String[] MAX_URLS={"50","100","500","1000"};
/** Robots disallow lists looked up by lower-cased host name in isRobotAllowed(). */
private HashMap disallowListCache=new HashMap();
// Search input controls.
private JTextField startTextField;
private JComboBox maxComboBox;
private JCheckBox limitCheckBox;
private JTextField logTextField;
private JTextField searchTextField;
private JCheckBox caseCheckBox;
private JButton searchButton;
// Crawl status controls.
private JLabel crawlingLabel2;
private JLabel crawledLabel2;
private JLabel toCrawlLabel2;
private JProgressBar progressBar;
private JLabel matchesLabel2;
/** Table listing the URLs whose pages matched the search string. */
private JTable table;
/**
 * True while a crawl is in progress.
 * The button handler clears this flag on the UI thread while the crawl loop
 * polls it on the worker thread started by search(); volatile guarantees the
 * stop request becomes visible to that loop.
 */
private volatile boolean crawling;
/** Writer for the match log file; opened by search() for each crawl. */
private PrintWriter logFileWriter;
/**
 * Builds the crawler window: a "File" menu, the search form with its status
 * rows on top, and the table of matching URLs below.
 * The window is sized but not shown; the caller makes it visible.
 */
public SearchCrawler(){
    setTitle("搜索爬行者");
    setSize(600,600);
    // Closing the window takes the same exit path as the menu item.
    addWindowListener(new WindowAdapter(){
        public void windowClosing(WindowEvent e){
            actionExit();
        }
    });

    // "File" menu holding a single "Exit" item.
    JMenuBar menuBar=new JMenuBar();
    JMenu fileMenu=new JMenu("文件");
    fileMenu.setMnemonic(KeyEvent.VK_F);
    JMenuItem fileExitMenuItem=new JMenuItem("退出",KeyEvent.VK_X);
    fileExitMenuItem.addActionListener(new ActionListener(){
        public void actionPerformed(ActionEvent e){
            actionExit();
        }
    });
    fileMenu.add(fileExitMenuItem);
    menuBar.add(fileMenu);
    setJMenuBar(menuBar);

    // Search form. Passing the constraints to add() registers them with the
    // panel's GridBagLayout, which is what setConstraints() did before.
    JPanel searchPanel=new JPanel(new GridBagLayout());
    GridBagConstraints constraints;

    // Row: start URL.
    addCaption(searchPanel,"开始URL:",0);
    startTextField=new JTextField();
    addRowFiller(searchPanel,startTextField,0);

    // Row: max URLs combo box, "limit to start site" check box, blank filler.
    addCaption(searchPanel,"最大抓取URL数(0表示不限制):",0);
    maxComboBox=new JComboBox(MAX_URLS);
    maxComboBox.setEditable(true);
    constraints=new GridBagConstraints();
    constraints.insets=new Insets(5,5,0,0);
    searchPanel.add(maxComboBox,constraints);
    limitCheckBox=new JCheckBox("限制抓取开始URL站点");
    constraints=new GridBagConstraints();
    constraints.anchor=GridBagConstraints.WEST;
    constraints.insets=new Insets(0,10,0,0);
    searchPanel.add(limitCheckBox,constraints);
    // Empty label whose only job is to end the row.
    constraints=new GridBagConstraints();
    constraints.gridwidth=GridBagConstraints.REMAINDER;
    searchPanel.add(new JLabel(),constraints);

    // Row: match log file, defaulting to crawler.log in the working directory.
    addCaption(searchPanel,"匹配日志文件:",0);
    String file=System.getProperty("user.dir")+
        System.getProperty("file.separator")+
        "crawler.log";
    logTextField=new JTextField(file);
    addRowFiller(searchPanel,logTextField,0);

    // Row: search string plus case-sensitivity check box.
    addCaption(searchPanel,"搜索字符串:",0);
    searchTextField=new JTextField();
    constraints=new GridBagConstraints();
    constraints.fill=GridBagConstraints.HORIZONTAL;
    constraints.insets=new Insets(5,5,0,0);
    constraints.gridwidth=2;
    constraints.weightx=1.0d;
    searchPanel.add(searchTextField,constraints);
    caseCheckBox=new JCheckBox("大小写敏感");
    constraints=new GridBagConstraints();
    constraints.insets=new Insets(5,5,0,5);
    constraints.gridwidth=GridBagConstraints.REMAINDER;
    searchPanel.add(caseCheckBox,constraints);

    // Row: the Search button (relabelled "Stop" by search() while crawling).
    searchButton=new JButton("搜索");
    searchButton.addActionListener(new ActionListener(){
        public void actionPerformed(ActionEvent e){
            actionSearch();
        }
    });
    constraints=new GridBagConstraints();
    constraints.gridwidth=GridBagConstraints.REMAINDER;
    constraints.insets=new Insets(5,5,5,5);
    searchPanel.add(searchButton,constraints);

    // Separator between the form and the status rows.
    addRowFiller(searchPanel,new JSeparator(),5);

    // Status rows: current URL, crawled count, pending count.
    addCaption(searchPanel,"爬行:",0);
    crawlingLabel2=newPlainLabel();
    addRowFiller(searchPanel,crawlingLabel2,0);

    addCaption(searchPanel,"已抓取的URL数:",0);
    crawledLabel2=newPlainLabel();
    addRowFiller(searchPanel,crawledLabel2,0);

    addCaption(searchPanel,"爬行的URL数",0);
    toCrawlLabel2=newPlainLabel();
    addRowFiller(searchPanel,toCrawlLabel2,0);

    // Progress row. The bar must fill horizontally and end its row like the
    // other status values; assigning HORIZONTAL to gridwidth (the previous
    // code) made it span two cells and left the row open.
    addCaption(searchPanel,"正在爬行进度:",0);
    progressBar=new JProgressBar();
    progressBar.setMinimum(0);
    progressBar.setStringPainted(true);
    addRowFiller(searchPanel,progressBar,0);

    // Match count row, with extra space below it.
    addCaption(searchPanel,"搜索匹配:",10);
    matchesLabel2=newPlainLabel();
    addRowFiller(searchPanel,matchesLabel2,10);

    // Read-only, single-column table of matching URLs.
    table=new JTable(new DefaultTableModel(new Object[][]{},new String[]{"URL"}){
        public boolean isCellEditable(int row,int column){
            return false;
        }
    });
    JPanel matchesPanel=new JPanel();
    matchesPanel.setBorder(BorderFactory.createTitledBorder("匹配"));
    matchesPanel.setLayout(new BorderLayout());
    matchesPanel.add(new JScrollPane(table),BorderLayout.CENTER);

    // Form on top, matches filling the remaining space.
    getContentPane().setLayout(new BorderLayout());
    getContentPane().add(searchPanel,BorderLayout.NORTH);
    getContentPane().add(matchesPanel,BorderLayout.CENTER);
}

/** Adds a right-aligned caption label that starts a form row. */
private static void addCaption(JPanel panel,String text,int bottomInset){
    GridBagConstraints gc=new GridBagConstraints();
    gc.anchor=GridBagConstraints.EAST;
    gc.insets=new Insets(5,5,bottomInset,0);
    panel.add(new JLabel(text),gc);
}

/** Adds a component that stretches horizontally and ends the current row. */
private static void addRowFiller(JPanel panel,Component component,int bottomInset){
    GridBagConstraints gc=new GridBagConstraints();
    gc.fill=GridBagConstraints.HORIZONTAL;
    gc.gridwidth=GridBagConstraints.REMAINDER;
    gc.insets=new Insets(5,5,bottomInset,5);
    panel.add(component,gc);
}

/** Creates an empty label whose font is switched to the plain style. */
private static JLabel newPlainLabel(){
    JLabel label=new JLabel();
    label.setFont(label.getFont().deriveFont(Font.PLAIN));
    return label;
}
/**
 * Handles a click on the Search/Stop button.
 * While a crawl is running the click is a stop request: the crawling flag is
 * cleared and nothing else happens. Otherwise the form is validated; all
 * problems are reported in one dialog, and if there are none the crawl is
 * started through search().
 */
private void actionSearch(){
    // Stop request: the crawl loop polls this flag.
    if(crawling){
        crawling=false;
        return;
    }
    ArrayList errorList=new ArrayList();

    // The start URL must be present and pass verifyUrl().
    String startUrl=startTextField.getText().trim();
    if(startUrl.length()<1){
        errorList.add("没有起始URL");
    }else if(verifyUrl(startUrl)==null){
        errorList.add("非法的起始URL");
    }

    // Max URLs: blank or 0 means "no limit" (as the form label states), which
    // crawl() and updateStats() expect to receive as -1. Anything else must
    // be a positive integer.
    int maxUrls=-1;
    Object selected=maxComboBox.getSelectedItem();
    String max=(selected==null)?"":selected.toString().trim();
    if(max.length()>0){
        try{
            int parsed=Integer.parseInt(max);
            if(parsed<0){
                errorList.add("非法最大URL数值");
            }else if(parsed>0){
                maxUrls=parsed;
            }
        }catch(NumberFormatException e){
            errorList.add("非法最大URL数值");
        }
    }

    // A log file name is required.
    String logFile=logTextField.getText().trim();
    if(logFile.length()<1){
        errorList.add("未填写日志文件");
    }

    // A search string is required.
    String searchString=searchTextField.getText().trim();
    if(searchString.length()<1){
        errorList.add("未填写搜索字符串");
    }

    // Report every problem at once, one per line, and give up.
    if(errorList.size()>0){
        StringBuffer message=new StringBuffer();
        for(int i=0;i<errorList.size();i++){
            message.append(errorList.get(i));
            if(i+1<errorList.size()){
                message.append("\n");
            }
        }
        showError(message.toString());
        return;
    }

    // Crawled URLs are stored without a leading "www".
    startUrl=removeWwwFromUrl(startUrl);
    search(logFile,startUrl,maxUrls,searchString);
}
/**
 * Runs one crawl on a new thread.
 * The thread puts the form into its busy state, opens the match log, calls
 * crawl(), then closes the log and returns the form to its idle state. If
 * the log cannot be opened the form is returned to idle before the error is
 * shown, so the user can correct the file name and retry.
 *
 * @param logFile      path of the match log file to create
 * @param startUrl     URL the crawl starts from
 * @param maxUrls      maximum number of URLs to crawl, or -1 for no limit
 * @param searchString whitespace-separated terms to look for
 */
private void search(final String logFile,final String startUrl,
        final int maxUrls,final String searchString){
    Thread thread=new Thread(new Runnable(){
        public void run(){
            // Busy state: wait cursor, form locked, button acts as "Stop".
            setCursor(Cursor.getPredefinedCursor(Cursor.WAIT_CURSOR));
            setSearchControlsEnabled(false);
            searchButton.setText("停止");

            // Start from an empty, read-only match table and zeroed stats.
            table.setModel(new DefaultTableModel(new Object[][]{},new String[]{"URL"}){
                public boolean isCellEditable(int row,int column){
                    return false;
                }
            });
            updateStats(startUrl,0,0,maxUrls);

            try{
                logFileWriter=new PrintWriter(new FileWriter(logFile));
            }catch(Exception e){
                // Without this the form would stay locked after the error.
                restoreIdleUi();
                showError("不能打开匹配日志文件");
                return;
            }

            crawling=true;
            try{
                crawl(startUrl,maxUrls,limitCheckBox.isSelected(),searchString,caseCheckBox.isSelected());
            }finally{
                crawling=false;
                // PrintWriter.close() never throws; checkError() flushes and
                // reports whether any write to the log failed.
                boolean logFailed=logFileWriter.checkError();
                logFileWriter.close();
                if(logFailed){
                    showError("不能关闭匹配日志文件");
                }
                crawlingLabel2.setText("结束");
                restoreIdleUi();
            }

            // Tell the user when nothing matched.
            if(table.getRowCount()==0){
                JOptionPane.showMessageDialog(SearchCrawler.this,"你的搜索字符串未被发现,请尝试其它","搜索字符串未被发现",JOptionPane.WARNING_MESSAGE);
            }
        }
    });
    thread.start();
}

/** Enables or disables every input control of the search form. */
private void setSearchControlsEnabled(boolean enabled){
    startTextField.setEnabled(enabled);
    maxComboBox.setEnabled(enabled);
    limitCheckBox.setEnabled(enabled);
    logTextField.setEnabled(enabled);
    searchTextField.setEnabled(enabled);
    caseCheckBox.setEnabled(enabled);
}

/** Unlocks the form, relabels the button "Search" and restores the default cursor. */
private void restoreIdleUi(){
    setSearchControlsEnabled(true);
    searchButton.setText("搜索");
    setCursor(Cursor.getDefaultCursor());
}
/**
 * Exits the program.
 * Any match log opened by a crawl is flushed first, because System.exit()
 * does not flush buffered writers and matches logged so far would be lost.
 */
private void actionExit(){
    PrintWriter writer=logFileWriter;
    if(writer!=null){
        writer.flush();
    }
    System.exit(0);
}
/**
 * Checks that a string is a well-formed HTTP URL.
 *
 * @param url text to check
 * @return the parsed URL, or null when the text does not start with
 *         "http://" (compared in lower case) or cannot be parsed
 */
private URL verifyUrl(String url ){
    String lowered=url.toLowerCase();
    boolean isHttp=lowered.startsWith("http://");
    if(isHttp){
        try{
            return new URL(url);
        }catch(Exception e){
            // Unparseable: fall through to the null result.
        }
    }
    return null;
}
/**
 * Records a matching URL in the match table and in the log file.
 * An error dialog is shown when the log is not open or the write fails.
 *
 * @param url URL of the page that matched
 */
private void addMatch(String url){
    DefaultTableModel model=(DefaultTableModel)table.getModel();
    model.addRow(new Object[]{url});

    // PrintWriter.println() never throws on I/O failure, so a try/catch
    // cannot detect one; checkError() flushes the line and reports it.
    PrintWriter writer=logFileWriter;
    boolean failed=(writer==null);
    if(!failed){
        writer.println(url);
        failed=writer.checkError();
    }
    if(failed){
        showError("未成功的日志匹配");
    }
}
/**
 * Refreshes the status rows and the progress bar.
 *
 * @param crawling text for the "crawling" label (the URL being processed)
 * @param crawled  number of URLs crawled so far
 * @param toCrawl  number of URLs still queued
 * @param maxUrls  crawl limit, or -1 when there is none
 */
private void updateStats(String crawling,int crawled,int toCrawl,int maxUrls){
    crawlingLabel2.setText(crawling);
    crawledLabel2.setText(String.valueOf(crawled));
    toCrawlLabel2.setText(String.valueOf(toCrawl));

    // With no limit the bar is scaled to everything known so far.
    int upperBound=(maxUrls==-1)?crawled+toCrawl:maxUrls;
    progressBar.setMaximum(upperBound);
    progressBar.setValue(crawled);

    matchesLabel2.setText(String.valueOf(table.getRowCount()));
}
/**
 * Tells whether the host's robots.txt permits fetching a URL.
 * The disallow list of each host is downloaded once from
 * http://host/robots.txt and kept in disallowListCache. A robots.txt that is
 * missing or unreadable is treated as "everything allowed".
 *
 * @param urlToCheck URL about to be crawled
 * @return false when the URL's file part starts with a disallowed path,
 *         true otherwise
 */
private boolean isRobotAllowed(URL urlToCheck){
    String host=urlToCheck.getHost().toLowerCase();
    ArrayList disallowList=(ArrayList)disallowListCache.get(host);

    // First URL seen for this host: build its disallow list.
    if(disallowList==null){
        disallowList=new ArrayList();
        BufferedReader reader=null;
        try{
            URL robotsFileUrl=new URL("http://"+host+"/robots.txt");
            reader=new BufferedReader(new InputStreamReader(robotsFileUrl.openStream()));
            String line;
            while((line=reader.readLine())!=null){
                if(line.indexOf("Disallow:")==0){
                    String disallowPath=line.substring("Disallow:".length());
                    // Drop a trailing comment, then surrounding blanks.
                    int commentIndex=disallowPath.indexOf("#");
                    if(commentIndex!=-1){
                        disallowPath=disallowPath.substring(0,commentIndex);
                    }
                    disallowPath=disallowPath.trim();
                    // An empty Disallow means "nothing is disallowed"; storing
                    // "" would block every path because startsWith("") is true.
                    if(disallowPath.length()>0){
                        disallowList.add(disallowPath);
                    }
                }
            }
        }catch(Exception e){
            // No usable robots.txt: allow everything, as before, and discard
            // whatever was read up to the failure.
            disallowList=new ArrayList();
        }finally{
            if(reader!=null){
                try{
                    reader.close();
                }catch(IOException ignored){
                    // Nothing useful can be done if closing fails.
                }
            }
        }
        // Remember the result so robots.txt is not fetched again per URL.
        disallowListCache.put(host,disallowList);
    }

    // Disallowed when the file part begins with any listed path.
    String file=urlToCheck.getFile();
    for(int i=0;i<disallowList.size();i++){
        String disallow=(String)disallowList.get(i);
        if(file.startsWith(disallow)){
            return false;
        }
    }
    return true;
}
/**
 * Downloads the content at a URL as text.
 * Line breaks are kept as '\n' so that words and tags on adjacent lines do
 * not run together in the returned text.
 *
 * @param pageUrl URL to read
 * @return the page text, or null when it cannot be read
 */
private String downloadPage(URL pageUrl){
    BufferedReader reader=null;
    try{
        reader=new BufferedReader(new InputStreamReader(pageUrl.openStream()));
        StringBuffer pageBuffer=new StringBuffer();
        String line;
        while((line=reader.readLine())!=null){
            pageBuffer.append(line).append('\n');
        }
        return pageBuffer.toString();
    }catch(Exception e){
        // Best effort: an unreadable page is reported as null and skipped.
        return null;
    }finally{
        if(reader!=null){
            try{
                reader.close();
            }catch(IOException ignored){
                // Nothing useful can be done if closing fails.
            }
        }
    }
}
/**
 * Removes a leading "www." host label from a URL.
 * Only a "www." that directly follows the first "://" is removed (compared
 * ignoring case), so hosts such as "wwwexample.com" and URLs embedded in a
 * query string are left untouched.
 *
 * @param url URL text
 * @return the URL without the leading "www.", or the input unchanged
 */
private String removeWwwFromUrl(String url){
    int schemeEnd=url.indexOf("://");
    if(schemeEnd!=-1){
        int hostStart=schemeEnd+3;
        if(url.regionMatches(true,hostStart,"www.",0,4)){
            return url.substring(0,hostStart)+url.substring(hostStart+4);
        }
    }
    return url;
}
//解析所有的页面内容找到链接
private ArrayList retrieveLinks(URL pageUrl,String pageContents,HashSet crawledList,boolean limitHost){
//编译链接匹配模式
Pattern p=Pattern.compile("<a//s+href//s*=//s*/"?(.*?)[/"|>]",Pattern.CASE_INSENSITIVE);
Matcher m=p.matcher(pageContents);
//建立链接匹配列表
ArrayList linkList=new ArrayList();
while(m.find()){
String link=m.group(1).trim();
//跳过空链接
if(link.length()<1){
continue;
}
//跳过页面锚记链接
if(link.charAt(0)=='#'){
continue;
}
//跳过邮件链接
if(link.indexOf("mailto:")!=-1){
continue;
}
//跳过JavaScript链接
if(link.toLowerCase().indexOf("javascript")!=-1){
continue;
}
//如果需要,加上绝对与相对URL
if(link.indexOf("://")==-1){
//处理绝对URL
if(link.charAt(0)=='/'){
link="http://"+pageUrl.getHost()+link;
//处理相对URL
}else{
String file=pageUrl.getFile();
if(file.indexOf('/')==-1){
link="http://"+pageUrl.getHost()+"/"+link;
}else{
String path=file.substring(0,file.lastIndexOf('/')+1);
link="http://"+pageUrl.getHost()+path+link;
}
}
}
//从链接移除锚记
int index=link.indexOf('#');
if(index!=-1){
link=link.substring(0,index);
}
//去除开头的"www"
link=removeWwwFromUrl(link);
//校验链接,如果非法,则跳过
URL verifiedLink=verifyUrl(link);
if(verifiedLink==null){
continue;
}
//如果是特定的,那些与起始相同的服务器的链接,则跳过
if(limitHost && !pageUrl.getHost().toLowerCase().equals(
verifiedLink.getHost().toLowerCase())){
continue;
}
//如果它已经被捕获,则跳过
if(crawledList.contains(link)){
continue;
}
//添加链接到列表
linkList.add(link);
}
return linkList;
}
/**
 * Tells whether a page contains every term of the search string.
 * The search string is split on whitespace; each term must occur somewhere
 * in the page as a substring.
 *
 * @param pageContents  text of the page
 * @param searchString  whitespace-separated terms
 * @param caseSensitive when false, page and terms are compared in lower case
 * @return true when all terms are found, false as soon as one is missing
 */
private boolean searchStringMatches(String pageContents,String searchString,boolean caseSensitive){
    String searchContents=caseSensitive?pageContents:pageContents.toLowerCase();

    // Split on runs of whitespace.
    String[] terms=Pattern.compile("\\s+").split(searchString);
    for(int i=0;i<terms.length;i++){
        String term=caseSensitive?terms[i]:terms[i].toLowerCase();
        if(searchContents.indexOf(term)==-1){
            return false;
        }
    }
    // No term was missing.
    return true;
}
/**
 * Crawls from a start URL and records every page containing the search string.
 * URLs are taken from the pending list in insertion order until the list is
 * empty, the crawling flag is cleared, or maxUrls pages have been crawled.
 * URLs that fail verifyUrl() or are disallowed by robots.txt are skipped.
 *
 * @param startUrl      first URL to crawl
 * @param maxUrls       maximum number of URLs to crawl, or -1 for no limit
 * @param limitHost     when true, only links on the linking page's host are followed
 * @param searchString  whitespace-separated terms to look for
 * @param caseSensitive whether term matching is case sensitive
 */
public void crawl(String startUrl,int maxUrls,boolean limitHost,String searchString,boolean caseSensitive){
    HashSet crawledList=new HashSet();
    // LinkedHashSet: no duplicates, and iteration follows insertion order.
    LinkedHashSet toCrawlList=new LinkedHashSet();
    toCrawlList.add(startUrl);

    while(crawling && toCrawlList.size()>0){
        // Stop once the limit, if any, is reached.
        if(maxUrls!=-1){
            if(crawledList.size()==maxUrls){
                break;
            }
        }
        // Take the oldest pending URL.
        String url=(String)toCrawlList.iterator().next();
        toCrawlList.remove(url);

        // This method is public, so the start URL may not have been
        // validated; skip anything verifyUrl() rejects instead of passing
        // null on.
        URL verifiedUrl=verifyUrl(url);
        if(verifiedUrl==null){
            continue;
        }
        // Skip URLs that robots.txt disallows.
        if(!isRobotAllowed(verifiedUrl)){
            continue;
        }

        updateStats(url,crawledList.size(),toCrawlList.size(),maxUrls);
        crawledList.add(url);

        String pageContents=downloadPage(verifiedUrl);
        if(pageContents!=null&&pageContents.length()>0){
            // Queue the page's links, then test the page itself.
            ArrayList links=retrieveLinks(verifiedUrl,pageContents,crawledList,limitHost);
            toCrawlList.addAll(links);
            if(searchStringMatches(pageContents,searchString,caseSensitive)){
                addMatch(url);
            }
        }
        updateStats(url,crawledList.size(),toCrawlList.size(),maxUrls);
    }
}
/**
 * Shows a message in a modal error dialog owned by this window.
 *
 * @param message text to display
 */
private void showError(String message){
    final String dialogTitle="错误";
    final int dialogKind=JOptionPane.ERROR_MESSAGE;
    JOptionPane.showMessageDialog(this,message,dialogTitle,dialogKind);
}
/**
 * Program entry point: creates the crawler window and shows it.
 * The window is built on the event dispatch thread, as Swing requires, and
 * shown with setVisible(true) instead of the deprecated show().
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args){
    SwingUtilities.invokeLater(new Runnable(){
        public void run(){
            SearchCrawler crawler=new SearchCrawler();
            crawler.setVisible(true);
        }
    });
}
}