ch = curl_init($url); curl_set">
 | 注册
请输入搜索内容

热门搜索

Java Linux MySQL PHP JavaScript Hibernate jQuery Nginx
ccpp
10年前发布

应用curl扩展抓取网页

    <?php
    namespace Think;

    header("Content-Type: text/html;charset=utf-8");

    /**
     * Minimal cURL-based page fetcher with two extraction strategies
     * (regular expression and marker-delimited substring).
     *
     * The page is downloaded once, in the constructor, and kept in $data.
     */
    class Mycurl
    {
        /** cURL handle returned by curl_init(), or false/null when unavailable. */
        public $ch = null;

        /** Response body returned by curl_exec(); false when the request failed. */
        public $data = null;

        /**
         * Initialise the handle and download $url immediately.
         *
         * @param string $url Address of the page to fetch.
         */
        public function __construct($url)
        {
            $this->ch = curl_init($url);
            if ($this->ch === false) {
                // curl_init() failed: leave the object in a "no data" state
                // instead of passing false to curl_setopt()/curl_exec().
                $this->data = false;
                return;
            }

            // Do not include response headers in the returned body.
            curl_setopt($this->ch, CURLOPT_HEADER, false);
            // Return the transfer as a string instead of printing it.
            curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, true);

            $this->data = curl_exec($this->ch);
        }

        /**
         * Release the cURL handle, if one was successfully created.
         */
        public function __destruct()
        {
            if ($this->ch) {
                curl_close($this->ch);
            }
        }

        /**
         * Regex-based extraction of the article body.
         *
         * Captures the text between
         * <div id="article_content" class="article_content"> and the first
         * following </div>. (To grab the page title instead, the pattern
         * '/(?<=<title>)(.*)(?=<\/title>)/i' can be used.)
         *
         * @return string|null Captured content, or null when nothing matched.
         */
        public function regmatch()
        {
            if (!is_string($this->data)) {
                return null;
            }

            // The previous pattern used a negated character class
            // ([^(?<\/div>)]*), which rejected any content containing the
            // letters d, i, v or the characters < / ? ( ) >. A lazy dot-all
            // match expresses the intended "up to the closing </div>".
            $reg = '/<div\sid="article_content"\sclass="article_content">(.*?)<\/div>/si';

            if (preg_match($reg, $this->data, $out) !== 1) {
                return null;
            }

            return $out[1];
        }

        /**
         * Substring-based extraction.
         *
         * Returns the slice of the page that starts at the first
         * case-insensitive occurrence of $pos1 (the marker itself is included)
         * and stops right before the first occurrence of $pos2 that follows it.
         *
         * @param string $pos1 Start marker.
         * @param string $pos2 End marker.
         * @return string The slice, or '' when the page is missing or either
         *                marker cannot be found.
         */
        public function result($pos1, $pos2)
        {
            if (!is_string($this->data) || $this->data === '') {
                return '';
            }

            $begin = stripos($this->data, $pos1);
            if ($begin === false) {
                return '';
            }

            // Search for the end marker only after the start marker, so an
            // earlier occurrence cannot yield a negative length.
            $end = stripos($this->data, $pos2, $begin + strlen($pos1));
            if ($end === false) {
                return '';
            }

            return substr($this->data, $begin, $end - $begin);
        }

        /**
         * Build the article record from the downloaded page.
         *
         * @return array Keys: title, content, atime, author, sort, summary.
         */
        public function exec()
        {
            $data = array();

            // result() keeps the leading "<title>" marker; strip it here.
            $title = $this->result('<title>', '-卢松松博客</title>');
            $data['title'] = ($title === '') ? '' : (string) substr($title, strlen('<title>'));

            $content = $this->result('<dd class="post-info">', '<center>');
            // Turn site-relative image paths into absolute URLs.
            $content = str_ireplace("/upload/", "http://lusongsong.com/upload/", $content);
            // Undo the double prefix created above for URLs that were already absolute.
            $content = str_ireplace("http://lusongsong.comhttp://lusongsong.com", "http://lusongsong.com", $content);
            // Undo a further side effect of the first replacement on "blog/upload/" paths.
            $content = str_ireplace("bloghttp://lusongsong.com", "blog", $content);
            $data['content'] = $content;

            $data['atime'] = time();
            $data['author'] = 'Internet';
            $data['sort'] = '精彩博文';

            // Summary: at most 180 bytes of plain text. mb_strcut() cuts on a
            // character boundary so a multibyte UTF-8 character is never split;
            // fall back to substr() when mbstring is not loaded.
            $plain = strip_tags($content);
            $data['summary'] = function_exists('mb_strcut')
                ? mb_strcut($plain, 0, 180, 'UTF-8')
                : substr($plain, 0, 180);

            return $data;
        }
    }

    // Test run: scrape a range of numbered article pages into the article table.
    $url = 'http://lusongsong.com/reed/';
    $num = 100;    // number of articles to fetch
    $start = 350;  // first article id

    // M() is a helper defined outside this file; it is expected to return an
    // object exposing add() for the 'article' table.
    $Art = M('article');

    for ($i = $start; $i < $start + $num; $i++) {
        $posurl = $url . $i . '.html';

        $curl = new Mycurl($posurl);
        $data = $curl->exec();
        $curl = null; // release the handle before moving on
        $data['oldlink'] = $posurl;

        // Skip pages with no usable title (fetch failed or markers missing)
        // and the site's 404 page. strpos() may legitimately return 0, so
        // compare against false explicitly.
        if ($data['title'] === '' || strpos($data['title'], "出现404错误页面了") !== false) {
            continue;
        }

        $Art->add($data);
    }

    // NOTE(review): $this only exists when this file is included from inside
    // an object method (presumably a controller action - confirm). The guard
    // avoids a fatal error when the script is run on its own.
    if (isset($this)) {
        $this->success("执行完成!", "index");
    }
    ?>