ACE之网页链接提取程序的方法

ACE之网页链接提取程序的方法

一个简单的提取网页上的链接的小程序

#include <cstring>
#include <fstream>
#include <iostream>
#include <string>

#include "boost/regex.hpp"

#include "ace/INET_Addr.h"
#include "ace/Log_Msg.h"
#include "ace/OS_NS_unistd.h"
#include "ace/SOCK_Connector.h"
#include "ace/SOCK_Stream.h"
#include "ace/Time_Value.h"

using namespace std;

//getHtml得到网页的内容
int getHtml(const char* ipaddr, char* recvbuf, unsigned len,char *pathname)
{
ACE_INET_Addr servaddr(80,ipaddr);

ACE_SOCK_Connector connector;
ACE_SOCK_Stream peer;
ACE_Time_Value sendTime(0,100);
ACE_Time_Value recvTime(0,1000);
if(connector.connect(peer, servaddr) == -1)
ACE_ERROR_RETURN((LM_ERROR,ACE_TEXT("%p/n"),ACE_TEXT("connect")),1);

char buff[512];
servaddr.addr_to_string(buff, 512);
std::cout<<buff<<std::endl;

iovec iov[3];

//填写HTTP请求命令

iov[0].iov_base = "GET ";
iov[0].iov_len = 4;

iov[1].iov_base = pathname;
iov[1].iov_len = strlen(pathname);

iov[2].iov_base =" HTTP/1.0/r/n/r/n";
iov[2].iov_len = 13;

if(peer.sendv_n(iov,3,&sendTime)== -1)
ACE_ERROR_RETURN((LM_ERROR,ACE_TEXT("(%p|%t) error in sending"),
ACE_TEXT("query to status server/n")),1);

ACE_OS::sleep(1);

if(peer.recv(recvbuf,len, &recvTime) == -1)
ACE_ERROR_RETURN((LM_ERROR,ACE_TEXT("(%p|%t) error in recving"),
ACE_TEXT("query to status server/n")),1);

return 0;
}

int ACE_TMAIN(int argc, ACE_TCHAR ** argv)
{

if(argc < 2)
cout<<"参数个数不够"<<endl;

const unsigned int BUFF_SIZE = 1024*64;
char buff[BUFF_SIZE];

char *pathname;
if(argc >=3)
{
pathname = argv[2];
}
else
{
pathname = "/";
}
getHtml(argv[1],buff,BUFF_SIZE,pathname);

cout<<buff<<endl;

boost::smatch m;
boost::regex reg("(((href)|(src))=.*?)(>)",boost::regex::icase);
string str(buff);
//寻找匹配
boost::sregex_iterator it(str.begin(),str.end(),reg);
boost::sregex_iterator end;

string filename(argv[1]);
filename += ".txt";
ofstream out(filename.c_str());

//输出到文件
for(;it != end;++it)
{
string trim = it->str();
string path(string(argv[1]) + pathname);
cout<<*it<<endl;

int pos = trim.find(' ');
if(pos >= 0)
{
trim = trim.substr(0,pos + 1);
}


pos = trim.find("http:");


if(pos >= 0)
{
trim = trim.substr(pos);
trim.erase(trim.size() - 1);
}
else
{

pos = trim.find_first_of('=');
trim = trim.substr(pos + 2);
trim.erase(trim.size() - 1);

if(trim[0] == '/')
{
string temp(trim.begin() +1 ,trim.end());
trim = temp;
}

path = "http://" + path;

if(*(path.end() - 1) == ' ')
path.erase(path.end() - 1);


trim = path + trim;
}


if(trim[(trim.size() - 1)] == '/"'||trim[(trim.size() - 1)] == '/'')
trim.erase(trim.size() - 1);

out<<trim<<endl;
}

return 0;
}

输入 程序名 wjl.scu.edu.cn /soft/

得出这些链接

http://wjl.scu.edu.cn/soft/images/logotop1.gif
http://wjl.scu.edu.cn/soft/images/menuleft.gif
http://wjl.scu.edu.cn/soft/
http://wjl.scu.edu.cn/soft/images/menu_mid.gif
http://wjl.scu.edu.cn/soft/music/
http://wjl.scu.edu.cn/soft/images/menu_mid.gif
http://wjl.scu.edu.cn/soft/soft/
http://wjl.scu.edu.cn/soft/images/menu_mid.gif
http://wjl.scu.edu.cn/soft/news/
http://wjl.scu.edu.cn/soft/images/menu_mid.gif
http://wjl.scu.edu.cn/soft/movie/
http://wjl.scu.edu.cn/soft/images/menu_mid.gif
http://wjl.scu.edu.cn/soft/original/
http://wjl.scu.edu.cn/soft/images/spacer.gif
http://wjl.scu.edu.cn/soft/images/search.gif
http://wjl.scu.edu.cn/soft/images/topbg1.gif
http://wjl.scu.edu.cn/soft/categories.php
http://wjl.scu.edu.cn/soft/images/soft5_r2_c1.jpg
http://wjl.scu.edu.cn/soft/images/spacer.gif
http://wjl.scu.edu.cn/soft/images/sm/smallico_r1_c5.jpg
http://wjl.scu.edu.cn/soft/search.php?selecttype=%B3%A3%D3%C3%B9%A4%BE%DF
http://wjl.scu.edu.cn/soft/images/sm/smallico_r4_c4.jpg
http://wjl.scu.edu.cn/soft/search.php?selecttype=操作系统
http://wjl.scu.edu.cn/soft/images/sm/smallico_r6_c5.jpg
http://wjl.scu.edu.cn/soft/search.php?selecttype=图形图像
http://wjl.scu.edu.cn/soft/images/sm/smallico_r8_c4.jpg
http://wjl.scu.edu.cn/soft/search.php?selecttype=媒体工具
http://wjl.scu.edu.cn/soft/images/sm/smallico_r10_c5.jpg
http://wjl.scu.edu.cn/soft/search.php?selecttype=硬件驱动
http://wjl.scu.edu.cn/soft/images/sm/smallico_r12_c6.jpg
http://wjl.scu.edu.cn/soft/search.php?selecttype=网络工具
http://wjl.scu.edu.cn/soft/images/sm/smallico_r19_c5.jpg
http://wjl.scu.edu.cn/soft/search.php?selecttype=书籍教程
http://wjl.scu.edu.cn/soft/images/sm/smallico_r16_c2.jpg
http://wjl.scu.edu.cn/soft/search.php?selecttype=休闲娱乐
http://wjl.scu.edu.cn/soft/images/sm/smallico_r20_c1.jpg
http://wjl.scu.edu.cn/soft/search.php?selecttype=电脑编程
http://wjl.scu.edu.cn/soft/images/sm/smallico_r22_c3.jpg
http://wjl.scu.edu.cn/soft/search.php?selecttype=其它软件
http://wjl.scu.edu.cn/soft/images/sm/smallico_r22_c3.jpg
http://wjl.scu.edu.cn/soft/search.php?selecttype=软件原码
http://wjl.scu.edu.cn/soft/images/sm/smallico_r22_c3.jpg
http://wjl.scu.edu.cn/soft/search.php?selecttype=原创开发者专栏
http://wjl.scu.edu.cn/soft/images/sm/smallico_r24_c4.jpg