/*
*Page.h
* Created on: 2011-10-12
* Author: qiuxiong
* Purpose: parse HTTP response header information and extract link information from web pages.
*/
#ifndef PAGE_H_
#define PAGE_H_
#include<string>
#include<map>
#include<vector>
#include<list>
#include"Url.h"
using namespace std;
// Maximum length of a URL's descriptive (anchor) text.
const int ANCHOR_TEXT_LEN = 256;

// Maximum number of URLs extracted from a single page.
const int MAX_URL_REFERENCES = 1000;

// Maximum length of the link markup extracted from a single page.
const int URL_REFERENCE_LEN = (URL_LEN + ANCHOR_TEXT_LEN) * MAX_URL_REFERENCES / 2;

// Project-defined page type.
enum page_type
{
    PLAIN_TEXT,
    OTHER
};
// A link prepared for the search engine: the URL paired with its
// descriptive (anchor) text. Both fields are raw pointers; this struct
// does not allocate or release what they point to.
struct RefLink4SE
{
    char *link;          // the URL
    char *anchor_text;   // descriptive text of the URL
};
// A link prepared for historical page archiving: only the URL is kept.
// The field is a raw pointer; this struct does not allocate or release
// what it points to.
struct RefLink4History
{
    char *link;   // the URL
};
class CPage
{
public:
string m_sUrl;//網(wǎng)頁(yè)對(duì)應(yīng)的URL字符串
string m_sHeader;//網(wǎng)頁(yè)頭信息
int m_nLenHeader;//網(wǎng)頁(yè)頭信息的長(zhǎng)度
int m_nStatusCode;//狀態(tài)碼
int m_nContentLength;//從網(wǎng)頁(yè)頭信息中提取的網(wǎng)頁(yè)體的長(zhǎng)度,一般不是很準(zhǔn)
string m_sLocation;//網(wǎng)頁(yè)的轉(zhuǎn)向信息,可以判斷這個(gè)網(wǎng)頁(yè)是否重定向
bool m_bConnectionState;//是否支持持續(xù)鏈接Keep-Alive為true否則為false
string m_sContentEncoding;//網(wǎng)頁(yè)體的編碼
string m_sContentType;//網(wǎng)頁(yè)體的類型
string m_sCharset;//網(wǎng)頁(yè)體的字符集
string m_sTransferEncoding;//網(wǎng)頁(yè)體的傳輸編碼方式
string m_sContent;//網(wǎng)頁(yè)體信息
int m_nLenContent;//網(wǎng)頁(yè)體信息的長(zhǎng)度
string m_sContentLinkInfo;//從網(wǎng)頁(yè)體中提取出包含超鏈接信息的標(biāo)識(shí),例如<img src="www.baidu.com"/> ,<a href="www.baidu.com">百度</a> <area href="www.baidu.com">百度</area>
string m_sLinkInfo4SE;//再?gòu)膍_sContentLinkInfo提取出<a href="www.baidu.com">百度</a> <area href="www.baidu.com">百度</area>標(biāo)識(shí)信息
int m_nLenLinkInfo4SE;//m_sLinkInfo4SE的長(zhǎng)度
string m_sLinkInfo4History;//再?gòu)膍_sContentLinkInfo提取出<img src="www.baidu.com">標(biāo)識(shí)信息
int m_nLenLinkInfo4History;//m_sLinkInfo4History的長(zhǎng)度
RefLink4SE m_RefLink4SE[MAX_URL_REFERENCES];//保存URL信息<-->URL的描述信息[這里URL指的是為搜索準(zhǔn)備的鏈接]
int m_nRefLink4SENum;//上面數(shù)組的長(zhǎng)度
RefLink4History m_RefLink4History[MAX_URL_REFERENCES/2];//保存URL信息[這個(gè)URL指的是為歷史網(wǎng)頁(yè)存檔準(zhǔn)備的鏈接]
int m_nRefLink4HistoryNum;//上面數(shù)組的長(zhǎng)度
map<string,string>m_mapLink4SE;//保存URL信息<-->URL的描述信息[這里URL指的是為搜索準(zhǔn)備的鏈接]-----當(dāng)然了這個(gè)map容器的作用主要是刪除一個(gè)網(wǎng)頁(yè)中相同的URL
vector<string>m_vecLink4History;//保存URL信息--當(dāng)然了這個(gè)vector容器的作用主要是刪除一個(gè)網(wǎng)頁(yè)中相同的URL
enum page_type m_eType;//網(wǎng)頁(yè)的類型
public:
CPage();
CPage(string strUrl,string strLocation,char *header,char *body,int nLenBody);
~CPage();
void ParseHeaderInfo(string header);//解析網(wǎng)頁(yè)頭信息
bool ParseHyperLinks();//從網(wǎng)頁(yè)中提取出鏈接信息
bool NormalizeUrl(string &strUrl);//判斷strUrl是不是正規(guī)的url
bool IsFilterLink(string plink);//判斷plink鏈接是不是要過(guò)濾掉
private:
//解析網(wǎng)頁(yè)頭信息
void GetStatusCode(string header);//得到狀態(tài)碼
void GetContentLength(string header);//從網(wǎng)頁(yè)頭信息中提取的網(wǎng)頁(yè)體的長(zhǎng)度,一般不是很準(zhǔn)
void GetConnectionState(string header);//得到連接狀態(tài)
void GetLocation(string header);//得到重定向信息
void GetCharset(string header);//得到字符集
void GetContentEncoding(string header);//得到網(wǎng)頁(yè)體編碼
void GetContentType(string header);//得到網(wǎng)頁(yè)體類型
void GetTransferEncoding(string header);//得到網(wǎng)頁(yè)體的傳輸編碼方式
//從網(wǎng)頁(yè)中提取出鏈接
bool GetContentLinkInfo();//從網(wǎng)頁(yè)體中提取出包含超鏈接信息的標(biāo)識(shí),例如<img src="www.baidu.com"/> ,<a href="www.baidu.com">百度</a> <area href="www.baidu.com">百度</area>
bool GetLinkInfo4SE();//再?gòu)膍_sContentLinkInfo提取出<a href="www.baidu.com">百度</a> <area href="www.baidu.com">百度</area>標(biāo)識(shí)信息
bool GetLinkInfo4History();//再?gòu)膍_sContentLinkInfo提取出<img src="www.baidu.com">標(biāo)識(shí)信息
bool FindRefLink4SE();//最終得到為搜索引擎準(zhǔn)備的超鏈接
bool FindRefLink4History();//最終得到為歷史網(wǎng)頁(yè)存檔準(zhǔn)備的超鏈接
};
#endif /* PAGE_H_ */
/*
* Page.cpp
* Created on: 2011-10-12
* Author: qiuxiong
* Purpose: parse HTTP response header information and extract link information from web pages.
*/
#include<iostream>
#include<cstdio>
#include<cstring>
#include<string>
#include<map>
#include<vector>
#include<iterator>
#include"Url.h"
#include"Page.h"
#include"StrFun.h"
// Default constructor: produces an empty page.
// Every scalar member is given a defined value (the original left
// m_nLenHeader, m_nLenContent, m_nLenLinkInfo4SE and m_nLenLinkInfo4History
// uninitialised). String and container members are empty by default
// construction. The initialiser list follows the declaration order.
CPage::CPage()
    : m_nLenHeader(0),
      m_nStatusCode(0),
      m_nContentLength(0),
      m_bConnectionState(false),
      m_nLenContent(0),
      m_nLenLinkInfo4SE(0),
      m_nLenLinkInfo4History(0),
      m_nRefLink4SENum(0),
      m_nRefLink4HistoryNum(0),
      m_eType(PLAIN_TEXT)
{
    // No links have been extracted yet: clear every slot of both tables.
    for(int i=0;i<MAX_URL_REFERENCES;++i)
    {
        m_RefLink4SE[i].link=NULL;
        m_RefLink4SE[i].anchor_text=NULL;
    }
    for(int i=0;i<MAX_URL_REFERENCES/2;++i)
        m_RefLink4History[i].link=NULL;
}
// Constructs a page from a downloaded response.
//   strUrl      - URL of the page.
//   strLocation - redirect URL; empty when the page was not redirected.
//   header      - NUL-terminated header text; NULL is treated as an empty header.
//   body        - page body bytes (may contain NUL bytes); NULL is treated as empty.
//   nLenBody    - number of bytes to copy from body; values <= 0 yield an empty body.
// Every scalar member is given a defined value (the original left
// m_nLenLinkInfo4SE and m_nLenLinkInfo4History uninitialised and
// dereferenced header/body without a NULL check).
CPage::CPage(string strUrl,string strLocation,char *header,char *body,int nLenBody)
    : m_sUrl(strUrl),
      m_nLenHeader(0),
      m_nStatusCode(0),
      m_nContentLength(0),
      m_sLocation(strLocation),
      m_bConnectionState(false),
      m_nLenContent(0),
      m_nLenLinkInfo4SE(0),
      m_nLenLinkInfo4History(0),
      m_nRefLink4SENum(0),
      m_nRefLink4HistoryNum(0),
      m_eType(PLAIN_TEXT)
{
    // No links have been extracted yet: clear every slot of both tables.
    for(int i=0;i<MAX_URL_REFERENCES;++i)
    {
        m_RefLink4SE[i].link=NULL;
        m_RefLink4SE[i].anchor_text=NULL;
    }
    for(int i=0;i<MAX_URL_REFERENCES/2;++i)
        m_RefLink4History[i].link=NULL;

    // Header text: guard against NULL, which would be undefined behaviour
    // for both std::string assignment and strlen.
    if(header!=NULL)
    {
        m_sHeader=header;
        m_nLenHeader=static_cast<int>(m_sHeader.length());
    }

    // Body bytes: copy exactly nLenBody bytes; a NULL pointer or a
    // non-positive length leaves the body empty with length 0.
    if(body!=NULL&&nLenBody>0)
    {
        m_sContent.assign(body,nLenBody);
        m_nLenContent=nLenBody;
    }
}
// Destructor: performs no explicit cleanup. The raw pointers stored in
// m_RefLink4SE / m_RefLink4History are not released here.
CPage::~CPage() {}
//解析網(wǎng)頁(yè)頭信息---調(diào)用8個(gè)私有的成員函數(shù)
void CPage::ParseHeaderInfo(string headerBuf)
{
GetStatusCode(headerBuf);
GetContentLength(headerBuf);
GetConnectionState(headerBuf);
GetLocation(headerBuf);
GetCharset(headerBuf);
GetContentEncoding(headerBuf);
GetContentType(headerBuf);
GetTransferEncoding(headerBuf);
}
// Extracts the status code from the status line, e.g. for
// "HTTP/1.0 200 OK" the status code is 200.
// m_nStatusCode is set to -1 when no "http/" token is found, when no
// space follows it, or when the text after that space is not a number.
void CPage::GetStatusCode(string headerBuf)
{
    CStrFun::Str2Lower(headerBuf,headerBuf.length());

    const string::size_type versionPos=headerBuf.find("http/");
    if(versionPos==string::npos)
    {
        m_nStatusCode=-1;
        return;
    }

    // The code follows the first space after the protocol version. The
    // search is bounded by the string, so a header without any space can
    // no longer run past the terminating NUL.
    const string::size_type spacePos=headerBuf.find(' ',versionPos);
    if(spacePos==string::npos)
    {
        m_nStatusCode=-1;
        return;
    }

    // %d rather than %i: status codes are decimal, and %i would read a
    // leading 0 as octal.
    int code=0;
    if(sscanf(headerBuf.c_str()+spacePos+1,"%d",&code)==1)
        m_nStatusCode=code;
    else
        m_nStatusCode=-1;
}
// Extracts the body length announced by the header, e.g. for
// "content-length: 21237" the length is 21237. The value comes from the
// server and is not necessarily correct.
// m_nContentLength is left untouched when the field is absent and set to
// -1 when the field is present but carries no decimal number.
void CPage::GetContentLength(string headerBuf)
{
    CStrFun::Str2Lower(headerBuf,headerBuf.length());

    const string::size_type keyPos=headerBuf.find("content-length");
    if(keyPos==string::npos)
        return;

    // From here on the field exists; assume it is malformed until a
    // number has actually been read.
    m_nContentLength=-1;

    // Locate the value through the colon instead of scanning for a space:
    // "content-length:123" (no space) is legal, and an unbounded scan for
    // ' ' could run past the end of the buffer.
    const string::size_type colonPos=headerBuf.find(':',keyPos);
    if(colonPos==string::npos)
        return;
    const string::size_type valuePos=headerBuf.find_first_not_of(" \t",colonPos+1);
    if(valuePos==string::npos)
        return;

    // Require a digit so that an empty value does not make sscanf skip the
    // line break and read a number from the following header line.
    const char firstChar=headerBuf[valuePos];
    if(firstChar<'0'||firstChar>'9')
        return;

    // %d rather than %i: the length is decimal, and %i would read a
    // leading 0 as octal.
    int length=0;
    if(sscanf(headerBuf.c_str()+valuePos,"%d",&length)==1)
        m_nContentLength=length;
}
// Extracts the redirect target, e.g. for "location: http://www.baidu.com/"
// the redirect information of m_sUrl is "http://www.baidu.com/".
// m_sLocation is left untouched when the field is absent or its line is
// not terminated by CR/LF. The value keeps the letter case of the header.
void CPage::GetLocation(string headerBuf)
{
    const string key("location:");
    const string original=headerBuf;   // untouched copy: URLs are case-sensitive
    CStrFun::Str2Lower(headerBuf,headerBuf.length());

    // Only accept the field name at the start of a line, so that
    // "content-location:" is not mistaken for "location:".
    string::size_type keyPos=headerBuf.find(key);
    while(keyPos!=string::npos&&keyPos!=0&&headerBuf[keyPos-1]!='\n')
        keyPos=headerBuf.find(key,keyPos+1);
    if(keyPos==string::npos)
        return;

    // Whitespace after the colon is optional; skip however much there is
    // instead of assuming exactly one space.
    const string::size_type valueBegin=headerBuf.find_first_not_of(" \t",keyPos+key.length());
    if(valueBegin==string::npos)
        return;
    const string::size_type valueEnd=headerBuf.find_first_of("\r\n",valueBegin);
    if(valueEnd==string::npos)
        return;

    string value=original.substr(valueBegin,valueEnd-valueBegin);
    // Drop trailing blanks before the line break.
    const string::size_type lastChar=value.find_last_not_of(" \t");
    if(lastChar!=string::npos)
        value.erase(lastChar+1);
    m_sLocation=value;
}
// Determines the character set of the page, e.g. for "charset=gb2312;"
// the character set is "gb2312".
// The header is examined first, then the first 2024 characters of the
// page body; a non-empty value found in the body overrides the one from
// the header. The stored value is lower-cased.
void CPage::GetCharset(string headerBuf)
{
    const string key("charset=");
    // A value ends at a blank, quote, separator, tag end or line break.
    // The original header branch had no terminator at all and stored the
    // entire rest of the header.
    const string terminators(" \"',;>\r\n");

    string sources[2];
    sources[0]=headerBuf;
    sources[1]=m_sContent.substr(0,2024);

    for(int i=0;i<2;i++)
    {
        string &text=sources[i];
        CStrFun::Str2Lower(text,text.length());

        string::size_type valueBegin=text.find(key);
        if(valueBegin==string::npos)
            continue;
        valueBegin+=key.length();

        // Skip an opening quote (charset="utf-8"); otherwise the quote
        // would terminate the value immediately and yield an empty charset.
        if(valueBegin<text.length()&&(text[valueBegin]=='"'||text[valueBegin]=='\''))
            valueBegin++;

        const string::size_type valueEnd=text.find_first_of(terminators,valueBegin);
        const string value=(valueEnd==string::npos)
            ?text.substr(valueBegin)
            :text.substr(valueBegin,valueEnd-valueBegin);

        // Never replace an already known charset with an empty one.
        if(!value.empty())
            m_sCharset=value;
    }
}
// Extracts the body encoding, e.g. for "content-encoding: x-compress"
// the encoding is "x-compress".
// m_sContentEncoding is left untouched when the field is absent or its
// line is not terminated by CR/LF. The stored value is lower-cased.
void CPage::GetContentEncoding(string headerBuf)
{
    const string key("content-encoding:");
    CStrFun::Str2Lower(headerBuf,headerBuf.length());

    const string::size_type keyPos=headerBuf.find(key);
    if(keyPos==string::npos)
        return;

    // Whitespace after the colon is optional; skip however much there is
    // instead of assuming exactly one space.
    const string::size_type valueBegin=headerBuf.find_first_not_of(" \t",keyPos+key.length());
    if(valueBegin==string::npos)
        return;
    const string::size_type valueEnd=headerBuf.find_first_of("\r\n",valueBegin);
    if(valueEnd==string::npos)
        return;

    string value=headerBuf.substr(valueBegin,valueEnd-valueBegin);
    // Drop trailing blanks before the line break.
    const string::size_type lastChar=value.find_last_not_of(" \t");
    if(lastChar!=string::npos)
        value.erase(lastChar+1);
    m_sContentEncoding=value;
}
// Determines the connection state, e.g. "Connection: Keep-Alive" or
// "Connection: Close". m_bConnectionState becomes true when the value is
// "keep-alive" (compared case-insensitively); otherwise it is left
// untouched.
void CPage::GetConnectionState(string headerBuf)
{
    const string key("connection:");
    CStrFun::Str2Lower(headerBuf,headerBuf.length());

    const string::size_type keyPos=headerBuf.find(key);
    if(keyPos==string::npos)
        return;

    // Whitespace after the colon is optional; skip however much there is
    // instead of assuming exactly one space.
    const string::size_type valueBegin=headerBuf.find_first_not_of(" \t",keyPos+key.length());
    if(valueBegin==string::npos)
        return;
    const string::size_type valueEnd=headerBuf.find_first_of(";\r\n",valueBegin);
    if(valueEnd==string::npos)
        return;

    string value=headerBuf.substr(valueBegin,valueEnd-valueBegin);
    // Trailing blanks would otherwise defeat the exact comparison below.
    const string::size_type lastChar=value.find_last_not_of(" \t");
    if(lastChar!=string::npos)
        value.erase(lastChar+1);

    if(value=="keep-alive")
        m_bConnectionState=true;
}
// Extracts the body type, e.g. for "content-type: image/gif" the type is
// "image/gif". Parameters after a ';' (such as a charset) are not part of
// the stored value.
// m_sContentType is left untouched when the field is absent or no ';' or
// line break follows the value. The stored value is lower-cased.
void CPage::GetContentType(string headerBuf)
{
    const string key("content-type:");
    CStrFun::Str2Lower(headerBuf,headerBuf.length());

    const string::size_type keyPos=headerBuf.find(key);
    if(keyPos==string::npos)
        return;

    // Whitespace after the colon is optional; skip however much there is
    // instead of assuming exactly one space.
    const string::size_type valueBegin=headerBuf.find_first_not_of(" \t",keyPos+key.length());
    if(valueBegin==string::npos)
        return;
    const string::size_type valueEnd=headerBuf.find_first_of(";\r\n",valueBegin);
    if(valueEnd==string::npos)
        return;

    string value=headerBuf.substr(valueBegin,valueEnd-valueBegin);
    // Drop trailing blanks, e.g. the one in "text/html ; charset=...".
    const string::size_type lastChar=value.find_last_not_of(" \t");
    if(lastChar!=string::npos)
        value.erase(lastChar+1);
    m_sContentType=value;
}
// Extracts the transfer encoding of the body, e.g. for
// "transfer-encoding: gzip" the transfer encoding is "gzip".
// m_sTransferEncoding is left untouched when the field is absent or no
// ';' or line break follows the value. The stored value is lower-cased.
void CPage::GetTransferEncoding(string headerBuf)
{
    const string key("transfer-encoding:");
    CStrFun::Str2Lower(headerBuf,headerBuf.length());

    const string::size_type keyPos=headerBuf.find(key);
    if(keyPos==string::npos)
        return;

    // Whitespace after the colon is optional; skip however much there is
    // instead of assuming exactly one space.
    const string::size_type valueBegin=headerBuf.find_first_not_of(" \t",keyPos+key.length());
    if(valueBegin==string::npos)
        return;
    const string::size_type valueEnd=headerBuf.find_first_of(";\r\n",valueBegin);
    if(valueEnd==string::npos)
        return;

    string value=headerBuf.substr(valueBegin,valueEnd-valueBegin);
    // Drop trailing blanks before the terminator.
    const string::size_type lastChar=value.find_last_not_of(" \t");
    if(lastChar!=string::npos)
        value.erase(lastChar+1);
    m_sTransferEncoding=value;
}
// Decides whether a URL should be filtered out.
// Returns true (filter) when the URL
//   - is empty or longer than URL_LEN,
//   - contains one of the ordered token pairs listed below (the second
//     token must start after the start of the first occurrence of the
//     first token), or
//   - contains any of the blacklisted substrings.
// All matching is done on a lower-cased copy of the URL.
// Returns false when none of the rules applies.
bool CPage::IsFilterLink(string plink)
{
    if(plink.empty()||plink.size()>URL_LEN)
        return true;

    string lowered=plink;
    CStrFun::Str2Lower(lowered,lowered.length());

    // {first, second}: filter when `second` is found beyond the first
    // occurrence of `first` (mostly "the same token appears twice").
    static const char *const orderedPairs[][2]=
    {
        {"?","?"},
        {"-","+"},
        {"&","&"},
        {"http://","http://"},
        {"http","http"},
        {"misc","misc"},
        {"ipb","ipb"}
    };
    const string::size_type pairCount=sizeof(orderedPairs)/sizeof(orderedPairs[0]);
    for(string::size_type p=0;p<pairCount;++p)
    {
        const string::size_type firstPos=lowered.find(orderedPairs[p][0]);
        if(firstPos==string::npos)
            continue;
        if(lowered.find(orderedPairs[p][1],firstPos+1)!=string::npos)
            return true;
    }

    // Substrings whose mere presence causes the URL to be filtered.
    static const char *const blockedTokens[]=
    {
        "cgi-bin","htbin","linder","srs5","uin-cgi",
        "uhtbin","snapshot","=+","=-","script",
        "gate","search","clickfile","data/scop","names",
        "staff/","enter","user","mail","pst?",
        "find?","ccc?","fwd?","tcon?","&",
        "counter?","forum","cgisirsi","{","}",
        "proxy","login","00.pl?","sciserv.pl","sign.asp",
        "<",">","review.asp?","result.asp?","keyword",
        "\"","'","php?s=","error","showdate",
        "niceprot.pl?","volue.asp?id",".css",".asp?month","prot.pl?",
        "msg.asp","register.asp","database","reg.asp","qry?u",
        "p?msg","tj_all.asp?page",".plot.","comment.php","nicezyme.pl?",
        "entr","compute-map?","view-pdb?","list.cgi?","lists.cgi?",
        "details.pl?","aligner?","raw.pl?","interface.pl?","memcp.php?",
        "member.php?","post.php?","thread.php","bbs/","/bbs"
    };
    const string::size_type tokenCount=sizeof(blockedTokens)/sizeof(blockedTokens[0]);
    for(string::size_type t=0;t<tokenCount;++t)
    {
        if(lowered.find(blockedTokens[t])!=string::npos)
            return true;
    }

    return false;
}