| 注册
请输入搜索内容

热门搜索

Java Linux MySQL PHP JavaScript Hibernate jQuery Nginx
jopen
10年前发布

PHP爬虫_电影ftp下载地址

建表语句:CREATE TABLE dy2018_url (id int(9) NOT NULL AUTO_INCREMENT, url varchar(2000) NOT NULL, status tinyint(2) NOT NULL, PRIMARY KEY(id));

代码:

    <?php
    /**
     * dy2018 crawler: forks one worker process per entry URL, walks every
     * paginated index page below it, opens each detail page and stores the
     * ftp:// download links it finds in the dy2018_url table.
     *
     * Requires the pcntl, posix, mbstring and (legacy) mysql extensions and
     * must be run from the CLI.
     * NOTE(review): the mysql_* API was removed in PHP 7.0; this script only
     * runs on PHP 5.x unless the database calls are ported to mysqli/PDO.
     */
    declare(ticks = 1);

    pcntl_signal(SIGQUIT, 'signal_handler');
    pcntl_signal(SIGTERM, 'signal_handler');

    // PIDs of the workers forked by the parent that have not been reaped yet.
    $crawlers_pid = array();
    // Number of workers the parent has seen exit.
    $finish_count = 0;

    /**
     * Signal handler for SIGQUIT/SIGTERM: forwards SIGTERM to every worker
     * still listed in $crawlers_pid, then terminates the current process.
     *
     * Workers empty their copy of $crawlers_pid right after the fork, so in a
     * worker this only prints the message and exits.
     *
     * @param int $signal Signal number delivered by pcntl.
     */
    function signal_handler($signal)
    {
        global $crawlers_pid;

        if ($signal === SIGQUIT || $signal === SIGTERM) {
            foreach ($crawlers_pid as $pid) {
                posix_kill($pid, SIGTERM);
            }
            echo "---------- crawl task exit ----------";
            exit();
        }
    }

    /**
     * Fetches a page with an HTTP GET request.
     *
     * @param string $url Page address.
     * @return string|false Response body, or false when the request fails.
     */
    function get_page_content($url)
    {
        return file_get_contents($url);
    }

    /**
     * Fetches a page with an HTTP POST request.
     *
     * @param string $url Page address.
     * @param array  $arr Form fields, sent application/x-www-form-urlencoded.
     * @return string|false Response body, or false when the request fails.
     */
    function get_page_content_by_post($url, $arr)
    {
        // The original stored the encoded body in $arr but then read the
        // undefined $data, so an empty body with Content-Length 0 was sent.
        $data = http_build_query($arr);
        $opts = array(
            'http' => array(
                'method'  => 'POST',
                // Header lines must be separated by CRLF.
                'header'  => "Content-type: application/x-www-form-urlencoded\r\n"
                           . 'Content-Length: ' . strlen($data) . "\r\n",
                'content' => $data,
            ),
        );
        $context = stream_context_create($opts);

        return file_get_contents($url, false, $context);
    }

    /**
     * Main flow: forks one worker per entry URL, then waits in the parent
     * until every worker has exited. Never returns; the process exits.
     */
    function run_dy2018()
    {
        global $crawlers_pid;
        global $finish_count;

        // Entry URLs: six TV categories followed by the numbered categories 0..20.
        $base = 'http://www.dy2018.com/';
        $crawl_urls = array();
        foreach (array('hytv', 'hepai', 'gangtai', 'oumeitv', 'rihantv', 'tvzz') as $tv) {
            $crawl_urls[] = $base . 'html/tv/' . $tv . '/';
        }
        for ($n = 0; $n <= 20; $n++) {
            $crawl_urls[] = $base . $n . '/';
        }

        foreach ($crawl_urls as $i => $url) {
            $pid = pcntl_fork();

            if ($pid === -1) {
                echo "system error. \ncheck it now!";
                // Do not leave already started workers running unsupervised.
                foreach ($crawlers_pid as $started_pid) {
                    posix_kill($started_pid, SIGTERM);
                }
                exit();
            }

            if ($pid > 0) {
                // Parent: remember the worker and fork the next one.
                $crawlers_pid[$i] = $pid;
                continue;
            }

            // Worker. Forget the inherited sibling PIDs so that a signal sent
            // to this worker does not make it kill its siblings.
            $crawlers_pid = array();

            $con = mysql_connect("localhost", "root", "123456");
            if (!$con) {
                die('Count not connect: '.mysql_error());
            }
            mysql_select_db("mysql", $con);
            crawl_process($url);
            mysql_close($con);

            // The worker must stop here. Falling back into the loop would
            // make every worker fork further workers for the remaining URLs.
            exit();
        }

        // Poll with WNOHANG instead of a blocking wait so the tick-based
        // signal handler keeps being invoked between polls. A counter bumped
        // inside a worker is never visible here (separate address space), so
        // finished workers are counted by reaping them.
        $total = count($crawlers_pid);
        while ($finish_count < $total) {
            $reaped = pcntl_waitpid(-1, $status, WNOHANG);
            if ($reaped > 0) {
                $finish_count++;
                // Drop reaped PIDs so the signal handler never signals them.
                $key = array_search($reaped, $crawlers_pid, true);
                if ($key !== false) {
                    unset($crawlers_pid[$key]);
                }
                continue;
            }
            if ($reaped === -1) {
                // No child processes are left to wait for.
                break;
            }
            sleep(1);
        }

        echo "---------- crawl task finish ----------";
        exit();
    }

    /**
     * Crawls one entry URL: walks index.html, index_2.html, ... until a page
     * can no longer be fetched, follows every /i/<id>.html detail link and
     * inserts each distinct ftp:// link of a detail page into dy2018_url.
     *
     * Relies on the MySQL connection most recently opened by the caller.
     *
     * @param string $url Entry URL ending in "/".
     */
    function crawl_process($url)
    {
        echo "start handle url:".$url;

        // Dots are escaped so they match a literal "." only.
        $info_url_pattern = '/\/i\/\d+\.html/';
        // ^ and $ anchors do not work here. "rmvb" is listed before "rm" so
        // .rmvb links are not cut off after ".rm".
        $ftp_url_pattern = '/ftp:\/\/.*?\.(swf|avi|flv|mpg|rmvb|rm|mov|wav|asf|3gp|mkv)/i';

        for ($page_idx = 1; ; $page_idx++) {
            $page_url = get_page_index_url($url, $page_idx);
            // echo instead of printf: the URL is data, not a format string.
            echo "start crawl url:".$page_url."\n";

            $page_content = get_page_content($page_url);
            if (!is_valid_page($page_content)) {
                // First page that cannot be fetched ends the pagination.
                break;
            }

            // Detail links are plain ASCII, so they are matched on the raw
            // (GBK) bytes; no re-encoding of the index page is needed.
            preg_match_all($info_url_pattern, $page_content, $matches_urls);

            // An index page may link the same detail page more than once.
            foreach (array_unique($matches_urls[0]) as $detail_path) {
                $detail_page_content = get_page_content('http://www.dy2018.com'.$detail_path);

                if ($detail_page_content !== false) {
                    $detail_page_content = mb_convert_encoding($detail_page_content, "UTF-8", "GBK");
                    preg_match_all($ftp_url_pattern, $detail_page_content, $ftp_urls);

                    foreach (array_unique($ftp_urls[0]) as $ftp_link) {
                        // Scraped text is untrusted: escape it before it is
                        // embedded in the SQL statement.
                        $escaped_link = mysql_real_escape_string($ftp_link);
                        mysql_query("insert into dy2018_url (url, status) values('$escaped_link','0')");
                        // echo mysql_error(); // uncomment to print MySQL errors
                    }
                }

                // Throttle: one detail page per second.
                sleep(1);
            }
        }
    }

    /**
     * Builds the URL of the index page with the given page number.
     *
     * @param string $url Entry URL ending in "/".
     * @param int    $idx Page number; 1 maps to index.html, n > 1 to
     *                    index_n.html, anything else leaves $url unchanged.
     * @return string
     */
    function get_page_index_url($url, $idx)
    {
        if ($idx == 1) {
            return $url.'index.html';
        }
        if ($idx > 1) {
            return $url.'index_'.$idx.'.html';
        }

        return $url;
    }

    /**
     * Tells whether a fetched page can be used.
     *
     * @param string|false $content Result of get_page_content().
     * @return bool False for a failed fetch or an empty body.
     */
    function is_valid_page($content)
    {
        return (bool) $content;
    }

    // run_dy2018() always terminates the process, so nothing follows it.
    run_dy2018();
    ?>