用SAS抓取网页内容 背景: 搜索引擎对网站的收录,会影响到网站SEO,而目前没找到什么好的工具来统计被收录的情况,也不会用其他语言来写爬虫,将就着用SAS来写(实在大材小用了)。 提取的主要字段: -URL---页面Title---收录更新时间---排列顺序---URL分类- 下面是代码(后面一部分可直接忽略): *================================================== ====================== 百度收录情况统计: 1.根据百度搜索...
';
total=compress(scan(scan(_t1,-3,">"),-2,"<"),"找到相关结果个约,");
call symput("total",total);
run;
%* Log the value of macro variable total that the preceding DATA step stored with CALL SYMPUT;
%put total number searched=&total;
%* Number of result pages to request, computed as integer division of total by basicn plus one;
%let n=%eval(&total/&basicn+1);
%put n=&n;
%* Loop that reads the following pages of the Baidu query result;
%do i=1 %to &n;
%* pnum is 0 for the first page and (i-1)*basicn afterwards - presumably the result offset sent to the search engine, TODO confirm;
%if &i=1 %then %let pnum=0;
%else %let pnum=%eval(%eval(&i-1)*&basicn);
%put pn=&pnum;
%* Each query is set to return 100 results (parameter rn=100);
%* NOTE(review): the URL literal below is empty in this copy of the program and must be restored before running;
filename baidu url
"";
%* Read each record as one whole string first;
data _t;
infile baidu length=len lrecl=5000;
input _t1 $varying5000. len;
if substr(_t1,1,19) in ("" "") rename=(_t1=t2));
href=scan(substr(t1,index(t1,"href="),length(t1)-index(t1,"href=")),2,"""");
title=scan(substr(t1,index(t1,"target="),length(t1)-index(t1,"target=")),3,"""");
title=substr(substr(title,2,length(title)),1,length(title)-1-length("1 %then %do;
%* Turn the listing destination off and route the CompareSummary ODS table of PROC COMPARE into dataset _out;
ods listing close;
ods output CompareSummary=_out;
%* Compare the dataset of the previous page (index i-1) with the dataset of the current page (index i);
proc compare base=_t%eval(&i.-1) compare=_t&i.;
run;
%* Stop capturing ODS output and switch the listing destination back on;
ods output close;
ods listing;
%* Pull the number of fully identical observations out of the PROC COMPARE summary text and store it in macro variable ncommon;
data _out;
  if _n_=1 then do;
    /* Compile the pattern once. \d+ matches the first run of digits in a line.
       The backslash is required: without it the pattern matches literal
       letters d and no number is ever extracted. */
    pattern=prxparse("/(\d+)/");
    if missing(pattern) then do;
      put "ERROR in compiling regular expression";
      stop;
    end;
  end;
  retain pattern;
  length num $20.;
  set _out;
  /* start and length receive the position and size of the first match in batch (start is 0 when nothing matches) */
  call prxsubstr(pattern,batch,start,length);
  if start>0 then num=substr(batch,start,length);
  /* Only the summary line whose text contains the literal below feeds ncommon */
  if index(batch,"所有比较变量都相等的观测数") then call symput("ncommon",num);
run;
%* The summary dataset is no longer needed;
proc datasets lib=work nolist;
  delete _out;
quit;
%put ncommon=&ncommon;
%* Use the PROC COMPARE result to decide whether two consecutive pages are completely identical (the original author marks this as a program bug workaround);
%* 1. If they are identical the last result page has already been reached, so dsn is set to the previous page index and control jumps to label next;
%if &ncommon=&basicn %then %do;
%let dsn=%eval(&i.-1);
%goto next;
%end;
%else %if &i.>100 %then %do;
%* Same exit once the loop index exceeds 100;
%let dsn=%eval(&i.-1);
%goto next;
%end;
%* Remove the work dataset _t before the next iteration;
proc datasets lib=work nolist;
delete _t;
quit;
%end;
%next:
%put dsn=&dsn;
%* Stack the per-page datasets _t1 through _t&dsn into _t0, leaving out the helper columns t1 and t2;
data _t0(drop=t1 t2);
  set
    %do j=1 %to &dsn;
      _t&j.
    %end;
  ;
run;
%* The per-page datasets are not needed once they have been stacked;
proc datasets lib=work nolist;
  delete
    %do j=1 %to &dsn;
      _t&j.
    %end;
  ;
quit;
%*------------------------------------------------------------------------------------
Find the most recent extraction made before the session date (SYSDATE9).
Starting one day back, step back one day at a time until a dataset named
dslib.dsout_yyyymmdd exists. That dataset is compared later with the one built
in this run to list added and removed pages.
The search gives up after maxback days so that a first run, where no earlier
dataset exists, cannot loop forever. In that case dst and dt_k point at a
dataset that does not exist.
--------------------------------------------------------------------------------------;
%local maxback;
%let maxback=3650;
%let k=1;
%let rc=1;
%do %until(&rc=0);
  %* Inside the loop dt holds the SAS date value that lies k days before the session date;
  %let dt=%eval(%sysfunc(inputn(&sysdate9.,date9.))-&k);
  %let dst=&dslib..&dsout._%sysfunc(putn(&dt.,yymmddn8.));
  %if %sysfunc(exist(&dst))=1 %then %let rc=0;
  %else %if &k>=&maxback %then %do;
    %put NOTE: no earlier extraction found within &maxback days.;
    %let rc=0;
  %end;
  %else %do;
    %let k=%eval(&k+1);
    %put k=&k;
  %end;
%end;
%put dstname=&dst;
%* From here on dt is the session date as yyyymmdd and dt_k is the date of the earlier extraction as yyyymmdd;
%let dt=%sysfunc(putn(%sysfunc(inputn(&sysdate9.,date9.)),yymmddn8.));
%let dt_&k.=%sysfunc(putn(%eval((%sysfunc(inputn(&sysdate9,date9.)))-&k),yymmddn8.));
%put dt=&dt;
%put dt_&k.=&&dt_&k;
*-------------------------------------------------------------------------------------;
%* This DATA step classifies every URL. Adapt the rules to the URL structure of your own site, for example home page, category pages, product pages and news pages;
data &dslib..&dsout._&dt.;
  set _t0;
  /* First token delimited by a slash in href, read from position 7 onward */
  mainsite=scan(substr(href,7,length(href)),1,"/");
  *URI="/"||scan(substr(href,7,length(href)),2,"/");
  /* Everything that follows mainsite in href, with a leading slash */
  URI="/"||substr(href,length(mainsite)+9,length(href)-length(mainsite));
  /* Text after the last dot of the part of URI that precedes any question mark */
  pagetype=scan(scan(URI,1,"?"),-1,".");
  /* Part of URI before the first dot, with every underscore-plus-digits group removed.
     The backslash in \d is required: without it the pattern deletes an
     underscore followed by literal letters d. */
  category=prxchange("s/_(\d+)//",-1,scan(scan(URI,1,"?"),1,"."));
run;
%* Count the indexed pages per category;
proc freq data=&dslib..&dsout._&dt. noprint;
  table category/out=_out1;
run;
%* URIs that were added to or dropped from the index since the earlier extraction;
data _null_;
  /* exist() returns 1 when the dataset is present and 0 otherwise.
     CALL SYMPUTX accepts the numeric result directly and stores it without
     leading blanks, which avoids the implicit numeric-to-character conversion
     that CALL SYMPUT performs. */
  rc_&k.=exist("&dslib..&dsout._&&dt_&k.");
  rc=exist("&dslib..&dsout._&dt.");
  call symputx("rc",rc);
  call symputx("rc_&k.",rc_&k.);
run;
%if &rc=1 and &&rc_&k=1 %then %do;
  proc sql noprint;
    /* URIs found in the earlier extraction but not in the current one */
    create table _1drop as
      select URI,href
      from &dslib..&dsout._&&dt_&k.
      where URI not in (select distinct URI from &dslib..&dsout._&dt.)
      order by URI;
    /* URIs found in the current extraction but not in the earlier one */
    create table _2new as
      select URI,href
      from &dslib..&dsout._&dt.
      where URI not in (select distinct URI from &dslib..&dsout._&&dt_&k.)
      order by URI;
  quit;
%end;
%else %if &rc=0 %then %do;
  %put dataset: &dslib..&dsout._&dt. not exist!;
%end;
%else %if &&rc_&k.=0 %then %do;
  %* The double ampersand is needed so that dt_k is resolved as one indirect reference;
  %put dataset: &dslib..&dsout._&&dt_&k. not exist!;
%end;
%* Write the indexed, added and dropped pages to the text file d:_put.txt;
%* Text output is used because the HTML output could not be sent through Outlook;
data _null_;
  file "d:_put.txt";
  set _out1 end=eof;
  /* Title, rule and column header go out once, before the first data row */
  if _n_=1 then do;
    put "01百度收录统计(重点页面):&dt.-&systime."
      / "-------------------------------------------------------------"
      / @1 "Obs" @8 "category" @54 "COUNT";
    put ;
  end;
  /* Only categories containing the text detail are listed here - adapt this filter to the URLs of your own site */
  if index(category,"detail")>0 then
    put @1 _n_ @8 category @50 count comma8.;
  /* Closing rule after the final row */
  if eof then put "-------------------------------------------------------------";
run;
/* Append section 02 (all categories) to the report file.
   The step must be named _null_ (with the trailing underscore) so that no
   output dataset is created: the misspelling _null wrote a dataset called
   WORK._NULL on every run. */
data _null_;
  /* The file is read and written in place: records are read until the last one, then the new lines are put */
  infile "d:_put.txt" sharebuffers lrecl=200 end=last dlm="09"x;
  file "d:_put.txt";
  input id : $200.;
  /* n1 comes from the nobs= option of the SET statement inside the loop */
  if last then do i=1 to n1;
    set _out1 nobs=n1;
    if i=1 then do;
      put;put;
      put "02百度收录统计(详情):&dt.-&systime.";
      put "-------------------------------------------------------------";
      put @1 "Obs" @8 "category" @54 "COUNT";
      put;
    end;
    put @1 i @8 category @50 count comma8.;
    if i=n1 then do;
      put "-------------------------------------------------------------";
    end;
  end;
run;
/* Append section 03 (pages dropped from the index) to the report file.
   The row count is taken from the nobs= option of the SET statement, which is
   filled in when the step is compiled. The earlier open()/attrn() lookup ran
   once per input record and never closed the dataset handle it obtained. */
data _null_;
  infile "d:_put.txt" sharebuffers lrecl=200 end=last dlm="09"x;
  file "d:_put.txt";
  input id : $200.;
  if last then do;
    if n1=0 then do;
      /* Nothing was dropped: write the zero-count notice only */
      put ;put ;
      put "-------------------------------------------------------------";
      put "03百度收录drop的页面数:0";
      put "-------------------------------------------------------------";
    end;
    else do i=1 to n1;
      set _1drop nobs=n1;
      if i=1 then do;
        put;put;
        put "03百度收录drop的页面:&dt.-&systime.";
        put " 上次收录日期为:&&dt_&k.";
        put "-------------------------------------------------------------";
        put @1 "Obs" @8 "URI" @54 "href";
        put;
      end;
      put @1 i @8 URI @54 href;
      if i=n1 then do;
        put "-------------------------------------------------------------";
      end;
    end;
  end;
run;
/* Append section 04 (pages newly added to the index) to the report file.
   The row count is taken from the nobs= option of the SET statement, which is
   filled in when the step is compiled. The earlier open()/attrn() lookup ran
   once per input record and never closed the dataset handle it obtained. */
data _null_;
  infile "d:_put.txt" sharebuffers lrecl=200 end=last dlm="09"x;
  file "d:_put.txt";
  input id : $200.;
  if last then do;
    if n1=0 then do;
      /* Nothing was added: write the zero-count notice only */
      put ;put ;
      put "-------------------------------------------------------------";
      put "04百度收录新增的页面数:0";
      put "-------------------------------------------------------------";
    end;
    else do i=1 to n1;
      set _2new nobs=n1;
      if i=1 then do;
        put;put;
        put "04百度收录新增的页面:&dt.-&systime.";
        put " 上次收录日期为:&&dt_&k.";
        put "-------------------------------------------------------------";
        put @1 "Obs" @8 "URI" @54 "href";
        put;
      end;
      put @1 i @8 URI @54 href;
      if i=n1 then do;
        put "-------------------------------------------------------------";
      end;
    end;
  end;
run;
%* Send the report file to a mailbox as an e-mail attachment;
%* Change the addresses as needed. The sending side requires Outlook to be configured on this machine;
filename myemail email "tianye@***.com"
  to="tianye@***.com"
  subject="百度收录概况:&dt.-&systime."
  attach=("d:_put.txt");
data _null_;
  file myemail;
  /* Message body: four lines written by one PUT, separated with the line pointer */
  put "百度收录情况查询,详情见附件txt文档!"
    / "-----------------------------------------------"
    / "本报告有SAS自动发出。"
    / "运行时间:&dt.-&systime.";
run;
%mend _shoulu;
%_shoulu(website=yoursite.com,dslib=shoulu,dsout=yoursite);/*
%_&dslib.(website=competitor1.com,dslib=shoulu,dsout=competitor1); %_&dslib.(website=competitor2.com,dslib=shoulu,dsout=competitor2);
该文档来自用户分享,如有侵权行为请发邮件ishare@vip.sina.com联系网站客服,我们会及时删除。
[版权声明] 本站所有资料为用户分享产生,若发现您的权利被侵害,请联系客服邮件isharekefu@iask.cn,我们尽快处理。
本作品所展示的图片、画像、字体、音乐的版权可能需版权方额外授权,请谨慎使用。
网站提供的党政主题相关内容(国旗、国徽、党徽..)目的在于配合国家政策宣传,仅限个人学习分享使用,禁止用于任何广告和商用目的。
你可能还喜欢