基本信息
源码名称:perl脚本爬虫程序,支持爬取北大未名bbs、163新闻、ifeng新闻、猫扑论坛、sina新闻等
源码大小:0.18M
文件格式:.rar
开发语言:Perl
更新时间:2013-01-08
友情提示:(无需注册或充值,赞助后即可获取资源下载链接)
嘿,亲!知识可是无价之宝呢,但咱这精心整理的资料也耗费了不少心血呀。小小地破费一下,绝对物超所值哦!如有下载和支付问题,请联系我们QQ(微信同号):813200300
本次赞助数额为: 2 元×
微信扫码支付:2 元
×
请留下您的邮箱,我们将在2小时内将文件发到您的邮箱
源码介绍
采用perl脚本写的爬虫程序,可以爬取北大未名bbs、163新闻、ifeng新闻、猫扑论坛、sina新闻等
抓取代码一:
#!/usr/bin/perl -w package Parser_ifeng; use Exporter; our @ISA=qw(Exporter); our @EXPORT=qw(parser_ifeng); #use strict; use warnings; use LWP::UserAgent; use LWP::Simple; use HTML::TokeParser; use Encode; use utf8; use Time::Local; use Mysql_conn; #binmode(STDIN, ':encoding(utf8)'); binmode(STDOUT, ':encoding(utf8)'); #binmode(STDERR, ':encoding(utf8)'); my $workdir="/home/webscience/proj/"; my $pmdir="$workdir/pm/"; my $logdir="$workdir/log/"; my $logfile="$logdir/ifeng.log"; my $logfp; my $tmpdir="$workdir/tmp/"; my $tmpfile="$tmpdir/ifeng.tmp"; my $tmpfile1="$tmpdir/ifeng1.tmp"; my %board_list=(); my $main_url="http://www.tianya.cn/bbs/index.shtml"; sub parser_ifeng { #初始化日志文件 open($logfp,">>$logfile") or die "can't open file:$!"; $board_list{'点击排行榜'}="http://news.ifeng.com/toprank/day/"; $board_list{'评论排行榜'}="http://news.ifeng.com/comment/day/"; $board_list{'表情_感动'}="http://news.ifeng.com/mood/day/1/"; $board_list{'表情_同情'}="http://news.ifeng.com/mood/day/2/"; $board_list{'表情_无聊'}="http://news.ifeng.com/mood/day/3/"; $board_list{'表情_愤怒'}="http://news.ifeng.com/mood/day/4/"; $board_list{'表情_搞笑'}="http://news.ifeng.com/mood/day/5/"; $board_list{'表情_难过'}="http://news.ifeng.com/mood/day/6/"; $board_list{'表情_高兴'}="http://news.ifeng.com/mood/day/7/"; $board_list{'表情_路过'}="http://news.ifeng.com/mood/day/8/"; my $key; my $value; local %url_hash;#用于去除重复的新闻 while(($key, $value) = each %board_list){ my %board; $board{'board_name'}=$key; $board{'board_url'}=$value; $board{'parent_name'}="凤凰网"; #获取版块id,若是新版块,则写入数据库 #------ my ($fid, $parent_id) = insert2forum(%board); $board{'board_id'} = $fid; $board{'parent_id'} = $parent_id; parser_board(%board); } phash(%board_list);#test } sub parser_board(%) { my %attr=(); my $token; my $page; my $p; my %board=@_; my $url=$board{'board_url'}; my @poster_list; my $tmp; my $i=0; #获取当前时间 my ($t_sec,$t_min,$t_hour,$t_mday,$t_mon,$t_year,$t_wday,$t_yday,$t_isdst) = localtime(); $t_year =1900; $t_mon =1; #unix时间 local $u_time=timelocal($t_sec,$t_min,$t_hour,$t_mday,$t_mon-1,$t_year-1900); print "board_url=$url\n"; $page=get($url); utf8::decode($page); $p= HTML::TokeParser->new(\$page); #解析新闻列表 while ($token = $p->get_tag("div")) { %attr=%{$token->[1]}; if(!$attr{'class'} || $attr{'class'} ne 'listPublic') {next;} $token = $p->get_tag("a"); %attr=%{$token->[1]}; #去除重复的新闻 my $href=$attr{'href'}; if($url_hash{"$href"}) {next; } $i ; $poster_list[$i]->{'poster_url'}=$attr{'href'}; $poster_list[$i]->{'title_name'}=$p->get_trimmed_text("/a"); $poster_list[$i]->{'board_name'}=$board{'board_name'}; $poster_list[$i]->{'board_id'} = $board{'board_id'}; #Add board id $poster_list[$i]->{'parent_id'} = $board{'parent_id'}; #Add board id $url_hash{"$href"}=$poster_list[$i]->{'title_name'}; # phash(%{$poster_list[$i]});exit; } $i=1; foreach $poster (@poster_list) { if(!$poster || $poster eq ""){next;} phash(%$poster); my ($code, $poster, $reply) = parser_poster(\%$poster); #if($code == 0){ # insert2topic($board{'board_id'}, \%$poster, \@$reply); #} $i ; } } #获取主贴信息 sub parser_poster { my %attr=(); my $page; my $tmp; my $p; my $token; my $poster=$_[0]; local @reply; my $url=$poster->{'poster_url'}; $page=get($url); utf8::decode($page); $p= HTML::TokeParser->new(\$page); #定位到论坛开始 主贴内容开始标识<div id="artical_sth"> while($token=$p->get_tag('div')){ %attr=%{$token->[1]}; if($attr{'id'} && $attr{'id'} eq 'artical_sth') {last;} } if(!$token){ wlog("$url 格式错误,跳过此帖"); return -1; } #抓取发帖时间 $token=$p->get_tag('span'); $tmp=$p->get_trimmed_text('/span'); $pattern='([\d]*)年([\d]*)月([\d]*)日 ([\d]*):([\d]*)'; if(!($tmp =~ m/$pattern/)){ wlog("$url 时间获取失败,跳过此帖"); return -1; } $poster->{'post_time'}="$1-$2-$3 $4:$5:00"; #抓取新闻来源 $token=$p->get_tag('span'); $poster->{'info_source'}=$p->get_trimmed_text('/span'); #抓取评论数 $token=$p->get_tag('span'); $poster->{'reply_num'}=$p->get_trimmed_text('/span'); #查找主贴内容 定位到<div id="clear"> while($token=$p->get_token()){ if($token->[0] eq "C"&& $token->[1] eq "<!--mainContent begin-->" ) {last;} } if(!$token){ wlog("$url 抓取内容失败,跳过此帖"); return -1; } $poster->{'content'}=$p->get_trimmed_text('div'); #将帖子内容和回复数据写入数据库 #insert2topic("topic", %$poster); phash(%$poster);#test #sqldeal(\%$poster,\@reply); insert2news(\%$poster); return 0; } #获取回复信息 sub get_reply() { my $p=$_[0]; my $tmp; my $page; my $i=1; my $url; my %attr; my %attr1; my $token; my $token1; R_BEGIN: while($token=$p->get_tag('table','div')){#'td')){ %attr=%{$token->[1]}; #<td id="pageDivBottom">,定位下一页url if($token->[0] eq 'div' && $attr{'id'} && $attr{'id'} eq 'adsp_content_banner_2'){ #### if($token->[0] eq 'td' && $attr{'id'} && $attr{'id'} eq 'pageDivBottom'){ while($token=$p->get_tag('a')){ if($p->get_trimmed_text('/a') eq '下一页'){ %attr=%{$token->[1]}; $url=$attr{'href'}; $page=get($url); utf8::decode($page); $p= HTML::TokeParser->new(\$page); #重新定位到<div id="pContentDiv"> while($token=$p->get_tag('div')){ %attr=%{$token->[1]}; if($attr{'id'} && $attr{'id'} eq 'pContentDiv') {next R_BEGIN;} } } } last R_BEGIN;#没有下一页,已经是最后一页,退出R_BEGIN }elsif($token->[0] eq 'div'){ next; } ##抓取回复, get_tag('table') #获取回帖用户id $p->get_tag('a'); $reply[$i]->{'uid'}=$p->get_trimmed_text('/a'); #获取回帖时间 $tmp=$p->get_trimmed_text('/td'); $patern='回复日期:([\d\D]*)'; if($tmp=~m/$patern/) {$reply[$i]->{'reply_time'}=$1;} #获取回帖内容 $reply[$i]->{'content'}=$p->get_trimmed_text('table','div'); $i ; } } #打印reply数组内容 sub preply(@) { my @reply=@_; my $i; for($i=1;$i<=$#reply;$i ) {print "[$i]\n";phash(%{$reply[$i]});} } #写入日志到 sub wlog($) { my ($t_sec,$t_min,$t_hour,$t_mday,$t_mon,$t_year,$t_wday,$t_yday,$t_isdst) = localtime(); $t_year =1900; $t_mon =1; print $logfp "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n"; print "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n"; } #打印hash表内容 sub phash(%) { my $key; my $value; my %hash =@_; my $i=1; while(($key, $value) = each %hash){ print $i ," $key = $value\n"; } #test print "\n"; } sub sqldeal(\%\@) { my %poster =%{$_[0]}; my @reply =@{$_[1]}; # phash(%$poster);#test # preply(@reply);#test } 1;
猫扑采集代码二:
#!/usr/bin/perl package Parser_mop; use Exporter; our @ISA = qw{Exporter}; our @EXPORT = qw{parser_mop}; use warnings; use LWP::Simple; use HTML::TokeParser; use Encode; use utf8; use Time::Local; use Mysql_conn; binmode STDOUT,":utf8"; my $workdir = "/home/webscience/proj/"; my $logfile = "$workdir/log/mop.log"; my $logfh; my $preurl = "http://dzh2.mop.com/static/rrlistSubLeft_0_"; my $mainurl = "http://dzh2.mop.com/static/rrlistSubLeft_0_0.html"; my $token; my %board_list = (); my @poster_list = (); my %poster = (); my %attr = (); my %board = (); my $i = 0; my $tmp; sub parser_mop { my $key; my $value; $board_list{"人肉搜索"} = "http://dzh2.mop.com/static/rrlistSubLeft_0_0.html"; #initial log file open($logfh,">>$logfile") or die "cannot open file: $!\n"; while(($key, $value) = each %board_list){ my %board; $board{'parent_name'} = '猫扑'; $board{'board_name'} = $key; $board{'board_url'} = $value; my ($fid, $parent_id) = insert2forum(%board); $board{'board_id'} = $fid; $board{'parent_id'} = $parent_id; parser_board(%board); } } sub parser_board { my %attr = (); my $token; my $page; my $p; my %board = @_; my $boardurl = $board{'board_url'}; my @poster_list; my $url = $boardurl; my $backurl = ".html"; for($count = 0; $count <= 4; $count ) { my $tmpurl = "$count$backurl"; $url = "$preurl$tmpurl"; print $url; print "\n"; $page = get($url); # or die "get $url failed.\n"; if(!$page){last;} utf8::decode($page); $p = HTML::TokeParser->new(\$page); #parser news list #定位到论坛开始 $i = 0; while($token=$p->get_tag("font")){ if($p->get_trimmed_text("/font") eq '我的回答'){last;} } if(!$token){ wlog("$url 无法定位帖子列表"); return -1; } #解析帖子列表 while($token = $p->get_tag("a")){ %attr = %{$token->[1]}; $pattern = '([\d] )条'; $tmp = $p->get_trimmed_text("/a"); if($tmp =~ m/$pattern/) { $poster_list[$i]->{'poster_url'} = "$attr{'href'}"; $poster_list[$i]->{'title_name'} = $tmp; $poster_list[$i]->{'reply_num'} = $1; $poster_list[$i]->{'click_num'} = 0; $poster_list[$i]->{'board_name'} = $board{'board_name'}; $poster_list[$i]->{'board_id'} = $board{'board_id'}; $poster_list[$i]->{'parent_id'} = $board{'parent_id'}; $i ; } next; } } foreach $poster (@poster_list){ if(!$poster || $poster eq ""){next;} # phash(%$poster); #my($code, $poster, $reply) = parser_poster(%$poster); } } sub parser_poster { my %poster = @_; my $poster_url = $poster{'poster_url'}; # print "$poster_url"; my @reply; $i = 0; my $page = get($poster_url); utf8::decode($page); my $p = HTML::TokeParser->new(\$page); # print "$page"; #main poster while($token = $p->get_tag("strong")) { %attr = %{$token->[1]}; if(!$attr{'class'} || $attr{'class'} ne "red"){next;} $tmp = $p->get_trimmed_text("/strong"); @mplist = split(/:/,$tmp,2); # if(!$mplist || $mplist[0] ne "悬赏值"){next;} $pattern = ':([\d] )'; if($tmp =~ m/$pattern/){ $poster->{'mp'} = $1; } else { last;} while($token = $p->get_tag("span")) { %attr = %{$token->[1]}; if(!$attr{'id'} || $attr{'id'} ne "span_Body"){next;} } $poster->{'content'} = $p->get_trimmed_text("/span"); #post time $token = $p->get_tag("em"); $tmp = $p->get_trimmed_text("/em"); $pattern = '([\d]{4})-([\d] )-([\d] )([\s] )([\d] ):([\d] ):([\d]) '; if($tmp =~ m/$pattern/){ $poster->{'post_time'} = "$1-$2-$3 $5:$6:$7";} #post author $token = $p->get_tag("span"); $poster->{'author'} = $p->get_trimmed_text("/span"); last; } if(!$token){ wlog("$poster_url error skip this page"); return -1; } $tmpp = $p; #get reply info #reply 1 while($token = $p->get_tag("p")) { %attr = %{$token->[1]}; if(!$attr{'id'} || $attr{'id'} ne "id_topicreply0"){next;} $token = $p->get_tag("font"); if(!$token){last;} %attr = %{$token->[1]}; $reply[$i]->{'reply_time'} = $attr{'title'}; $token = $p->get_tag("span"); $reply[$i]->{'content'} = $p->get_trimmed_text("/span"); $token = $p->get_tag("span"); $token = $p->get_tag("span"); $reply[$i]->{'uid'} = $p->get_trimmed_text("/span"); $i ; last; } $p = $tmpp; while($token = $p->get_tag("hr")) { %attr = %{$token->[1]}; if(!$attr{'class'} || $attr{'class'} ne "hr2"){next;} $token = $p->get_tag("font"); if(!$token){last;} %attr = %{$token->[1]}; $reply[$i]->{'reply_time'} = $attr{'title'}; $token = $p->get_tag("span"); $reply[$i]->{'content'} = $p->get_trimmed_text("/span"); $token = $p->get_tag("span"); $token = $p->get_tag("span"); $reply[$i]->{'uid'} = $p->get_trimmed_text("/span"); $i ; } phash(%$poster); preply(@reply); #database insert2topic(\%$poster,\@reply); return 0; } #print reply content sub preply(@) { my @reply = @_; for(my $i = 0; $i < $#reply; $i ) { print "[$i]"; phash (%{$reply[$i]}); } } #print hash sub phash(%) { my $key; my $value = ""; my %hash =@_; $i=1; while(($key, $value) = each %hash){ print $i ," $key = $value \n"; } #test print "\n"; } sub wlog($) { ($t_sec,$t_min,$t_hour,$t_mday,$t_mon,$t_year,$t_wday,$t_yday,$t_isdst) = localtime(); $t_year =1900; $t_mon =1; print $logfh "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n"; print "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n"; } 1;
新浪采集代码三:
#!/usr/bin/perl package Parser_mil_news_sina; use Exporter; our @ISA = qw(Exporter); our @EXPORT = qw(parser_mil_news_sina); use warnings; #use strict; use LWP::Simple; use LWP::UserAgent; use HTML::TokeParser; use Encode; use utf8; use Time::Local; use Mysql_conn; binmode STDOUT,":utf8"; my $workdir = "/home/webscience/proj/"; my $logdir = "$workdir/log/"; my $logfile = "$logdir/news_sina.log"; my $logfh; my $preurl = "http://mil.news.sina.com.cn/"; my %board_list = (); my %board_all = (); sub parser_mil_news_sina { open($logfh,">>logfile") or die "cannot open the log file.\n"; my $url = "http://mil.news.sina.com.cn/hotnews/Daily/index.shtml"; my %board; $board{'parent_name'} = '军事论坛_新浪网'; $board{'board_name'} = '军事热门帖子'; $board{'board_url'} = $url; my ($fid, $parent_id) = insert2forum(%board); $board{'board_id'} = $fid; $board{'parent_id'} = $parent_id; parser_board(%board); } sub parser_board(%) { my %attr = (); my $token; my $page; my $p; my %board = @_; my $url = $board{'board_url'}; my @poster_list = (); # my $poster ; my $i = 0; #军事热门帖子 $page = get($url); # or die "get $url failed.\n"; utf8::decode($page); $p = HTML::TokeParser->new(\$page); #parser news list while($token = $p->get_tag("tbody")){ while($token = $p->get_token()){ if($token->[0] eq "C" && $token->[1] eq "<!-- 帖子排行列表 begin -->") {last;} } if(!$token){ wlog("$url get 帖子排行列表 failed."); return -1; } while($token = $p->get_tag("td")){ %attr = %{$token->[1]}; if(!$attr{'class'} || $attr{'class'} ne 'linkNews'){ next;} $i ; $token = $p->get_tag("a"); %attr = %{$token->[1]}; $poster_list[$i]->{'poster_url'} = $attr{'href'}; $poster_list[$i]->{'title_name'} = $p->get_trimmed_text("/a"); $token = $p->get_tag("td"); my $text = $p->get_trimmed_text("/td"); if($text =~ /:/){$poster_list[$i]->{'post_time'} = "2010-".$text; $poster_list[$i]->{'click_num'} = 0;} else {$poster_list[$i]->{'click_num'} = $text; $poster_list[$i]->{'post_time'} = "";} $poster_list[$i]->{'board_name'} = $board{'board_name'}; $poster_list[$i]->{'board_id'} = $board{'board_id'}; $poster_list[$i]->{'parent_id'} = $board{'parent_id'}; #Add board id } } $i = 1; foreach my $poster (@poster_list){ if(!$poster || $poster eq ""){next;} phash(%$poster); my($code, $poster, $reply) = parser_poster(\%$poster); $i ; } } #get main poster content sub parser_poster { my %attr = (); my $page; my $p; my $token; my $poster = $_[0]; my $url = $poster->{'poster_url'}; # my $post_time = $poster->{'poster_url'}; $poster->{'content'} = "content missed"; $poster->{'keyword'} = "no keyword"; $poster->{'reply_num'} = 0; if($poster->{'post_time'} eq "") { exit;} #get current time my ($t_sec,$t_min,$t_hour,$t_mday,$t_mon,$t_year,$t_wday,$t_yday,$t_isdst) = localtime(); $t_year =1900; $t_mon =1; #unix时间 # local my $u_time=timelocal($t_sec,$t_min,$t_hour,$t_mday,$t_mon-1,$t_year-1900); $page = get($url); utf8::decode($page); $p = HTML::TokeParser->new(\$page); if(!$p){ print "$url cannot get to";return -1;} #local to author while($token = $p->get_tag("td")){ %attr = %{$token->[1]}; print "$attr{'class'}"; if(!$attr{'class'} && $attr{'class'} ne "postauthor"){ next;} $token = $p->get_tag(); $token = $p->get_tag("a"); $poster->{'author'} = $p->get_trimmed_text('/a'); print "author: $poster->{'author'}"; print "\n"; last; if(!$token){ wlog("$url get author failed."); return -1; } last; } while($token = $p->get_tag("td")){ %attr = %{$token->[1]}; if($attr{'class'} && $attr{'class'} eq "postcontent") { while($token = $p->get_tag("em")){ %attr = %{$token->[1]}; if(my $text = $p->get_trimmed_text("/em") eq "小") { $poster->{'post_time'} = $p->get_text(); last; } } #news count while($token = $p->get_tag("span")){ if($token = $p->get_tag("font")) {last;} } if(!$token){ wlog("$url get news count failed."); return -1; } $poster->{'click_num'} = $p->get_trimmed_text('/font'); if($token = $p->get_tag("font")) { $poster->{'reply_num'} = $p->get_trimmed_text('/font');} #news content while($token = $p->get_tag("p")){ %attr = %{$token->[1]}; if($attr{'align'} && $attr{'align'} eq "center") { $poster->{'content'} = $p->get_trimmed_text("font"); } } last; } } #click num > 5000 insert2news(\%$poster); phash(%$poster); } #写入日志到 sub wlog($) { ($t_sec,$t_min,$t_hour,$t_mday,$t_mon,$t_year,$t_wday,$t_yday,$t_isdst) = localtime(); $t_year =1900; $t_mon =1; print $logfh "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n"; print "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n"; } #打印hash表内容 sub phash(%) { my $key; my $value; my %hash =@_; my $i=1; while(($key, $value) = each %hash){ print $i ," $key = $value\n"; } #test print "\n"; } 1;
163代码四:
#!/usr/bin/perl -w package Parser_163; use Exporter; our @ISA=qw(Exporter); our @EXPORT=qw(parser_163); #use strict; use warnings; use LWP::UserAgent; use LWP::Simple; use HTML::TokeParser; use Encode; use utf8; use Time::Local; use Mysql_conn; #binmode(STDIN, ':encoding(utf8)'); binmode(STDOUT, ':encoding(utf8)'); #binmode(STDERR, ':encoding(utf8)'); my $workdir="/home/webscience/proj/"; my $pmdir="$workdir/pm/"; my $logdir="$workdir/log/"; my $logfile="$logdir/163.log"; my $logfp; my $tmpdir="$workdir/tmp/"; my $tmpfile="$tmpdir/163.tmp"; my %board_list=(); my $main_url="http://news.163.com/special/0001386F/rank_whole.html"; sub parser_163 { #初始化日志文件 open($logfp,">>$logfile") or die "can't open file:$!"; wlog("开始抓取163新闻\n"); local %url_hash;#用于去除重复的新闻 my %board; $board{'board_name'}="163新闻"; $board{'board_url'}=$main_url; $board{'parent_name'}='163新闻'; #获取版块id,若是新版块,则写入数据库 #------ my ($fid, $parent_id) = insert2forum(%board); $board{'board_id'} = $fid; $board{'parent_id'} = -1; parser_board(%board); } sub parser_board(%) { my %attr=(); my $token; my $page; my $p; my %board=@_; my $url=$board{'board_url'}; my @poster_list; my $tmp; my $i=0; #获取当前时间 my ($t_sec,$t_min,$t_hour,$t_mday,$t_mon,$t_year,$t_wday,$t_yday,$t_isdst) = localtime(); $t_year =1900; $t_mon =1; #unix时间 local $u_time=timelocal($t_sec,$t_min,$t_hour,$t_mday,$t_mon-1,$t_year-1900); $page=get($url); utf8::decode($page); $p= HTML::TokeParser->new(\$page); #解析新闻列表,定位到<div class="tabContents active"> while ($token = $p->get_tag("div")) { %attr=%{$token->[1]}; if($attr{'class'} && $attr{'class'} eq 'tabContents active') {last;} } while ($token = $p->get_tag("a","div")) { #以<div class="nav_channel_end">为终止标记 %attr=%{$token->[1]}; if($attr{'class'} && $attr{'class'} eq 'nav_channel_end') {last;} if($token->[0] eq "div") {next;} #去除重复的新闻 my $href=$attr{'href'}; if($url_hash{"$href"}) {next; } $i ; $poster_list[$i]->{'poster_url'}=$attr{'href'}; $poster_list[$i]->{'title_name'}=$p->get_trimmed_text('/a');; $poster_list[$i]->{'board_name'}=$board{'board_name'}; $poster_list[$i]->{'board_id'} = $board{'board_id'}; #Add board id $poster_list[$i]->{'parent_id'} = $board{'parent_id'}; #Add board id $url_hash{"$href"}=$poster_list[$i]->{'title_name'}; # phash(%{$poster_list[$i]}); } $i=1; foreach $poster (@poster_list) { if(!$poster || $poster eq ""){next;} my ($code, $poster, $reply) = parser_poster(\%$poster); #if($code == 0){ # insert2topic($board{'board_id'}, \%$poster, \@$reply); #} $i ; } } #获取主贴信息 sub parser_poster { my %attr=(); my $page; my $tmp; my $p; my $token; local $poster=$_[0]; local @reply; my $url=$poster->{'poster_url'}; $page=get($url); utf8::decode($page); $p= HTML::TokeParser->new(\$page); #定位到论坛开始 主贴内容开始标识<div class="sourceTime" while($token=$p->get_tag('div')){ %attr=%{$token->[1]}; if($attr{'class'} && $attr{'class'} eq 'sourceTime') {last;} } if(!$token){ wlog("$url 格式错误,跳过此帖"); return -1; } #抓取新闻来源 $token=$p->get_tag('span'); $poster->{'info_source'}=$p->get_trimmed_text('/span'); #抓取发帖时间 $token=$p->get_tag('div'); $tmp=$p->get_trimmed_text('/div'); $pattern='([\d]*)年([\d]*)月([\d]*)日([\d]*):([\d]*)'; if(!($tmp =~ m/$pattern/)){ wlog("$url 时间获取失败,跳过此帖"); return -1; } $poster->{'post_time'}="$1-$2-$3 $4:$5:00"; #获取评论url,获取评论数 while($token=$p->get_tag('a')){ $tmp=$p->get_trimmed_text('/a'); if($tmp eq '我来说两句'){ %attr=%{$token->[1]}; $reply_url=$attr{'href'}; } } get_reply($reply_url); #查找主贴内容 定位到<div class="text clear" id="contentText" $p= HTML::TokeParser->new(\$page); while($token=$p->get_tag('div')){ %attr=%{$token->[1]}; if($attr{'id'} && $attr{'id'} eq 'contentText') {last;} } if(!$token){ wlog("$url 抓取内容失败,跳过此帖"); return -1; } $poster->{'content'}=$p->get_trimmed_text('script'); #将帖子内容和回复数据写入数据库 #insert2topic("topic", %$poster); #phash(%$poster);#test #preply(@reply);#exit; #sqldeal(\%$poster,\@reply); insert2news(\%$poster); return 0; } #获取回复信息 sub get_reply() { my $p; my $tmp; my $page; my $i=1; my $url=$_[0]; my $isnext=0; my %attr; my $token; #获取评论数 $page=get($url); utf8::decode($page); $p= HTML::TokeParser->new(\$page); while($token=$p->get_tag('span')){ %attr=%{$token->[1]}; #定位到<span id="ptdown"> if($attr{'id'} && $attr{'id'} eq "ptdown"){ $tmp=$p->get_trimmed_text('/span'); $tmp =~ m/共有原创评论([\d]*)条/; $poster->{'reply_num'} = $1; return 0; } } } #打印reply数组内容 sub preply(@) { my @reply=@_; my $i; for($i=1;$i<=$#reply;$i ) {print "[$i]\n";phash(%{$reply[$i]});} } #写入日志到 sub wlog($) { my ($t_sec,$t_min,$t_hour,$t_mday,$t_mon,$t_year,$t_wday,$t_yday,$t_isdst) = localtime(); $t_year =1900; $t_mon =1; print $logfp "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n"; print "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n"; } #打印hash表内容 sub phash(%) { my $key; my $value; my %hash =@_; my $i=1; while(($key, $value) = each %hash){ print $i ," $key = $value\n"; } #test print "\n"; } sub sqldeal(\%\@) { my %poster =%{$_[0]}; my @reply =@{$_[1]}; # phash(%$poster);#test # preply(@reply);#test } 1;
搜狐采集代码五:
#!/usr/bin/perl -w package Parser_sohu; use Exporter; our @ISA=qw(Exporter); our @EXPORT=qw(parser_sohu); #use strict; use warnings; use LWP::UserAgent; use LWP::Simple; use HTML::TokeParser; use Encode; use utf8; use Time::Local; use Mysql_conn; #binmode(STDIN, ':encoding(utf8)'); binmode(STDOUT, ':encoding(utf8)'); #binmode(STDERR, ':encoding(utf8)'); my $workdir="/home/webscience/proj/"; my $pmdir="$workdir/pm/"; my $logdir="$workdir/log/"; my $logfile="$logdir/sohu.log"; my $logfp; my $tmpdir="$workdir/tmp/"; my $tmpfile="$tmpdir/sohu.tmp"; my %board_list=(); my $main_url="http://comment.news.sohu.com/djpm/"; sub parser_sohu { #初始化日志文件 open($logfp,">>$logfile") or die "can't open file:$!"; wlog("开始抓取sohu新闻\n"); local %url_hash;#用于去除重复的新闻 my %board; $board{'board_name'}="搜狐新闻"; $board{'board_url'}=$main_url; $board{'parent_name'}='搜狐新闻'; #获取版块id,若是新版块,则写入数据库 #------ my ($fid, $parent_id) = insert2forum(%board); $board{'board_id'} = $fid; $board{'parent_id'} = -1; parser_board(%board); } sub parser_board(%) { my %attr=(); my $token; my $page; my $p; my %board=@_; my $url=$board{'board_url'}; my @poster_list; my $tmp; my $i=0; #获取当前时间 my ($t_sec,$t_min,$t_hour,$t_mday,$t_mon,$t_year,$t_wday,$t_yday,$t_isdst) = localtime(); $t_year =1900; $t_mon =1; #unix时间 local $u_time=timelocal($t_sec,$t_min,$t_hour,$t_mday,$t_mon-1,$t_year-1900); $page=get($url); utf8::decode($page); $p= HTML::TokeParser->new(\$page); #解析新闻列表,定位到<tbody> while ($token = $p->get_tag("/tbody")) { for($j=1;$j<=10;$j ){ $token = $p->get_tag("tr"); $token = $p->get_tag("a"); %attr=%{$token->[1]}; #去除重复的新闻 my $href=$attr{'href'}; if($url_hash{"$href"}) {next; } $i ; $poster_list[$i]->{'poster_url'}=$attr{'href'}; if($attr{'title'}){ $poster_list[$i]->{'title_name'}=$attr{'title'}; }else{ $poster_list[$i]->{'title_name'}=$p->get_trimmed_text('/a');; } $poster_list[$i]->{'board_name'}=$board{'board_name'}; $poster_list[$i]->{'board_id'} = $board{'board_id'}; #Add board id $poster_list[$i]->{'parent_id'} = $board{'parent_id'}; #Add board id $url_hash{"$href"}=$poster_list[$i]->{'title_name'}; # phash(%{$poster_list[$i]}); } } $i=1; foreach $poster (@poster_list) { if(!$poster || $poster eq ""){next;} my ($code, $poster, $reply) = parser_poster(\%$poster); #if($code == 0){ # insert2topic($board{'board_id'}, \%$poster, \@$reply); #} $i ; } } #获取主贴信息 sub parser_poster { my %attr=(); my $page; my $tmp; my $p; my $token; local $poster=$_[0]; local @reply; my $url=$poster->{'poster_url'}; $page=get($url); utf8::decode($page); $p= HTML::TokeParser->new(\$page); #定位到论坛开始 主贴内容开始标识<div class="sourceTime" while($token=$p->get_tag('div')){ %attr=%{$token->[1]}; if($attr{'class'} && $attr{'class'} eq 'sourceTime') {last;} } if(!$token){ wlog("$url 格式错误,跳过此帖"); return -1; } #抓取新闻来源 $token=$p->get_tag('span'); $poster->{'info_source'}=$p->get_trimmed_text('/span'); #抓取发帖时间 $token=$p->get_tag('div'); $tmp=$p->get_trimmed_text('/div'); $pattern='([\d]*)年([\d]*)月([\d]*)日([\d]*):([\d]*)'; if(!($tmp =~ m/$pattern/)){ wlog("$url 时间获取失败,跳过此帖"); return -1; } $poster->{'post_time'}="$1-$2-$3 $4:$5:00"; #获取评论url,获取评论数 while($token=$p->get_tag('a')){ $tmp=$p->get_trimmed_text('/a'); if($tmp eq '我来说两句'){ %attr=%{$token->[1]}; $reply_url=$attr{'href'}; } } get_reply($reply_url); #查找主贴内容 定位到<div class="text clear" id="contentText" $p= HTML::TokeParser->new(\$page); while($token=$p->get_tag('div')){ %attr=%{$token->[1]}; if($attr{'id'} && $attr{'id'} eq 'contentText') {last;} } if(!$token){ wlog("$url 抓取内容失败,跳过此帖"); return -1; } $poster->{'content'}=$p->get_trimmed_text('script'); #将帖子内容和回复数据写入数据库 #insert2topic("topic", %$poster); # phash(%$poster);#test #preply(@reply);#exit; #sqldeal(\%$poster,\@reply); insert2news(\%$poster); return 0; } #获取回复信息 sub get_reply() { my $p; my $tmp; my $page; my $i=1; my $url=$_[0]; my $isnext=0; my %attr; my $token; #获取评论数 $page=get($url); utf8::decode($page); $p= HTML::TokeParser->new(\$page); while($token=$p->get_tag('span')){ %attr=%{$token->[1]}; #定位到<span id="allCount"> if($attr{'id'} && $attr{'id'} eq "allCount"){ $tmp=$p->get_trimmed_text('/span'); $tmp =~ m/([\d]*)条/; $poster->{'reply_num'} = $1; return 0; } } } #打印reply数组内容 sub preply(@) { my @reply=@_; my $i; for($i=1;$i<=$#reply;$i ) {print "[$i]\n";phash(%{$reply[$i]});} } #写入日志到 sub wlog($) { my ($t_sec,$t_min,$t_hour,$t_mday,$t_mon,$t_year,$t_wday,$t_yday,$t_isdst) = localtime(); $t_year =1900; $t_mon =1; print $logfp "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n"; print "[$t_year-$t_mon-$t_mday $t_hour:$t_min:$t_sec] $_[0]\n"; } #打印hash表内容 sub phash(%) { my $key; my $value; my %hash =@_; my $i=1; while(($key, $value) = each %hash){ print $i ," $key = $value\n"; } #test print "\n"; } sub sqldeal(\%\@) { my %poster =%{$_[0]}; my @reply =@{$_[1]}; # phash(%$poster);#test # preply(@reply);#test } 1;