一段 perl 代码展示如何 all in one 优雅地编写一个异步爬虫

V2EX = way to explore

V2EX 是一个关于分享和探索的地方

现在注册

已注册用户请登录

这是一个创建于 807 天前的主题，其中的信息可能已经有所发展或是发生改变。

#cpan https://cpan.metacpan.org/authors/id/S/SR/SRI/Mojolicious-7.31.tar.gz
#cpanm -n [email protected]
use feature ':5.10';
use strict;
use warnings;
use utf8;
use Mojo;
use Encode qw(decode encode);
##########################################################################
$ENV{MOJO_REACTOR} = 'Mojo::Reactor::EV';

#使用 EV 具有更好的性能
my $ua = Mojo::UserAgent->new;
$ua->inactivity_timeout(60);
$ua->connect_timeout(60);
$ua->request_timeout(60);

#适当延长超时的时间，阻止过早的 http 请求失败，会有更好的性能
$ua->max_connections(1000);

#最大连接数 1000
$ua->max_redirects(0);

#阻止 http3xx 重定向
$ua->transactor->name('Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0');

#使用正常浏览器的 user agent
$ua->cookie_jar->ignore( sub { 1 } );
#禁用 Mojo::UserAgent 自动处理 cookie
$ua->proxy->http('http://127.0.0.1:8080')->https('http://127.0.0.1:8080');
#使用代理服务器

##########################################################################
my @list = ();

#原始队列
my @urllist = ();

#下载队列
my $n = 0;

#下载数量
my $m = 0;

#出错数量
my $produce_num = 0;

#生产者数量
my $consumer_num = 0;

#消费者数量
my $cookie_num = 0;

#cookie 数量
my @cookielist = ();

#使用的 cookie 队列
my %cookieinvalid = ();

#失效的 cookie 散列
##########################################################################
open FILEIN, '<', "./url.txt" or die "$!";
while (<FILEIN>) {
    my $content = $_;
    chomp($content);
    $content =~ s/\r//;
    push( @list, $content );
}
close FILEIN;

#导入下载列表
##########################################################################
sub append_txt_to_file {
    my $file_name = $_[0];
    my $txt       = $_[1];
    local *FH;
    open FH, '>>', $file_name;
    print FH $txt;
    close FH;
}

sub write_txt_to_file {
    my $file_name = $_[0];
    my $txt       = $_[1];
    local *FH;
    open FH, '>', $file_name;
    print FH $txt;
    close FH;
}
my %safe_character = (
    '<'  => '＜',
    '>'  => '＞',
    ':'  => '：',
    '"'  => '＂',
    '/'  => '／',
    '\\' => '＼',
    '|'  => '｜',
    '?'  => '？',
    '*'  => '＊',
);

sub repace_safe {
    my $per_char = $_[0];
    my $one_txt  = $_[1];
    my $output_char;
    if ( exists $safe_character{$per_char} ) {
        $output_char = $safe_character{$per_char};
    }
    else {
        $output_char = $per_char;
    }
    return $output_char;
}

sub find {
    my $html_bin = $_[0];
    my $id       = $_[1];
    if ( $html_bin =~ m/<\/html>/ ) {
        return '####';
    }
    else {
        return '@@@';
    }
}
##########################################################################
sub get_multiplex {
    my $id    = $_[0];
    my $delay = Mojo::IOLoop->delay( sub { get_multiplex($id) } );

    #get_multiplex 递归迭代的开始标记
    #$id 是每一个线程（端口的序号）
    my $end = $delay->begin;
    Mojo::IOLoop->timer( 0.1 => $delay->begin );

    #每个 http 请求前暂停 0.1s
    if ( scalar @urllist == 0 ) {
        if ( $produce_num == $consumer_num ) {
            Mojo::IOLoop->stop;

            #异步循环结束
            #当队列数量为 0 ，且所有的线程数据都处理完毕的时候，终止事件循环
            #return 存在一个递归返回链，这里可以更快地结束
        }
        return;

        #这里返回后异步任务数量为 0 时，系统会自动结束异步循环，不过速度较慢
        #return 返回闭包函数的开始，并结束闭包函数，下面不开启递归自身
    }
    else {
        my $object = shift @urllist;
        $produce_num++;
        my $url      = $object;
        my $filename = $object;
        $filename =~ s/^http:\/\/www\.bing\.com\/w\///m;
        $filename =~ s/(.)/repace_safe($1)/eg;
        $filename = "./www.bing.com/" . $filename . ".html";
        if ( -e $filename ) {
            syswrite STDERR, encode( 'utf8', $n . "\t" . $object . "\t   跳过\n" );    #STDOUT 编码已改，输送到 STDOUT 会出现错误
            $consumer_num++;
            $end->();
        }
        else {
            my $build_tx = $ua->build_tx( GET => $url );
            $build_tx->req->headers->remove('Accept-Encoding');

            #阻止网页压缩，保证更好的性能
            $build_tx->req->headers->add( 'Accept'          => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8' );
            $build_tx->req->headers->add( 'Accept-Language' => 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2' );
            $ua->transactor->name( 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:' . int( rand(900) ) . ') Gecko/' . int( rand(40000001) ) . ' Firefox/' . int( rand(900) ) . '.0' );

            #使用 2 万个 cookie
            $ua->start(
                $build_tx => sub {
                    my ( $ua, $tx ) = @_;
                    if ( !$tx->is_finished ) {
                        push( @urllist, $object );
                        syswrite STDERR, "http 传输未完成" . "\n";
                        syswrite STDERR, encode( 'utf8', $url . "\t" . $tx->error->{message} . "\n" );
                    }
                    else {
                        my $code = '';
                        $code = $tx->res->code if defined $tx->res->code;
                        if ( $code =~ /\A2/ ) {
                            my $size           = $tx->res->content->asset->size;
                            my $content_length = $tx->res->headers->to_hash->{'Content-Length'};
                            if ( ( $size == $content_length ) || !( defined $content_length ) ) {
                                my $outnum = find( $tx->res->body, $id );
                                if ( $outnum ne '@@@' ) {
                                    append_txt_to_file( "url.txt", $object . "\t" . $outnum . "\n" );
                                    write_txt_to_file( $filename, $tx->res->body );
                                    $n++;
                                    syswrite STDERR, encode( 'utf8', $n . "\t" . $object . "\n" );
                                }
                                else {
                                    $n++;
                                    syswrite STDERR, encode( 'utf8', $m . "\t" . $object . "\t 网页下载完整但未提取到数据\n" );
                                }
                            }
                            else {
                                $m++;
                                push( @urllist, $object );
                                syswrite STDERR, encode( 'utf8', $m . "\t" . $object . "\t 网页未下载完整\n" );
                            }
                        }
                        elsif ( $code =~ /\A4/ ) {
                            syswrite STDERR, encode( 'utf8', $m . "\t" . $object . "\thttp 4xx\n" );
                            push( @urllist, $object );
                            Mojo::IOLoop->timer( 0.5 => $delay->begin );

                            #http 404
                        }
                        else {
                            $m++;
                            push( @urllist, $object );
                            syswrite STDERR, encode( 'utf8', $m . "\t" . $object . "\t 未发现 http code, http 3xx, http 5xx\n" );

                            #标记失效的从 cookie http 3xx
                            Mojo::IOLoop->timer( 0.5 => $delay->begin );

                            #服务器返回 5xx ，暂停 0.5s
                            #未发现 http code, http 302, http 503
                        }
                    }
                    $consumer_num++;
                    $end->();

                    #get_multiplex 递归迭代的结束标记
                    #从这里跳转到下一个 get_multiplex
                }
            );
        }
    }
}
##########################################################################
$produce_num  = 0;
$consumer_num = 0;
@urllist      = @list;

#异步下载前的变量准备
foreach my $id ( 1 .. 50 ) { get_multiplex($id) }

#使用 50 个线程（端口）下载
#如果线程数是 100 ，限制最大 cookie 数无法生效，并且 EV 会出现错误
Mojo::IOLoop->start;

#异步循环启动
##########################################################################

17 条回复 • 2023-12-19 00:25:13 +08:00

wxf666

2022-09-15 18:47:48 +08:00

每秒大概能爬多少个页面？

dfgddgf

2022-09-15 18:49:26 +08:00

@wxf666 300M 宽带跑满，必应和百度都能坚挺，扛得住

wxf666

2022-09-15 18:55:39 +08:00

@dfgddgf 你本地测试，平均每秒能爬下来并解析多少页面呢

想看看这 perl 的效率如何

dfgddgf

2022-09-15 18:56:51 +08:00

@wxf666 300M 带宽每秒 37.5-40MB/s 下载速度，按照一个网页 0.7MB 计算，每秒可以下载 50 个。

如果网页比较小，每秒下载几百个网页轻轻松松。

别把人家服务器搞崩溃了。

爬虫学的好，牢饭吃得饱。

wxf666

2022-09-15 19:05:42 +08:00

@dfgddgf 你在上一个帖子的意思，不是『如何用最少的人工，写出速度最快的爬虫』嘛

感觉你有几亿数十亿页面要爬取解析来着。。

所以想看看你最后，是如何用最优雅的姿势，写出最能压榨机器性能的爬虫的

dfgddgf

2022-09-15 19:27:56 +08:00

@wxf666 VirtualBox 虚拟机 linux mint 安装 apache2 ，使用 84KB 的网页文件作为主页，使用上面的代码稍作修改

在 cygwin 环境执行上面的 perl 代码，重复下载本地的 84KB 的网页文件( http://192.168.1.5/index.html) 10 万次数
耗时

real 3m25.076s
user 2m5.890s
sys 0m31.780s

算下来，连同网页正则匹配，平均请求速率是 100000/205s=487.8 个 /每秒

perl 做异步爬虫够不够强大

那些说 perl 没落、过时、已死的网友，其实是不了解 perl 语言及其生态的。

wxf666

2022-09-15 19:41:24 +08:00

@dfgddgf 感觉脚本语言的网络库、正则库、网页解析库等，底层应该都是 C/C++ 实现的吧

Python 、Perl 、Ruby 速度应该差不多的

perl 好像是文本处理较为优势，听说搞生物的常用？