本文主要是介绍php脚本爬取头像图片,希望对大家解决编程问题提供一定的参考价值,需要的开发者们随着小编来一起学习吧!
因为要插入系统用户,所以用 php 爬取百度图片上的头像 url,再把图片存储到本地。速度一般,1000 张图片差不多要花费半个多小时,不知道是不是 file_get_contents 函数的缘故,或者是因为没有开多线程(多个 php-fpm 进程),没有仔细研究优化。要提高脚本速度,可以从多线程和"异步网络请求 + 回调"两个方向去解决。以下是代码:
<?php
/**
 * Avatar scraper: queries the Baidu image-search JSON endpoint a few times with
 * a randomly chosen search phrase and result offset, then downloads every
 * thumbnail listed in the response into ./userSysAvatorUrl.
 *
 * The script is best-effort: a failed request or download is logged and
 * skipped instead of aborting the whole run.
 */
error_reporting(E_ALL ^ E_NOTICE);
set_time_limit(0);

// Number of search requests to issue; each one asks for 30 results (rn=30).
$pageNum = 3;

// Result offsets (the "pn" query parameter) to sample from.
$pnArr = array('0', '30', '60', '90', '120', '150', '180', '210', '240', '270', '300', '330', '360');

// One row per search: array(URL-encoded search phrase, "gsm" value, trailing
// numeric parameter name). The values are reproduced from the original
// hard-coded request URLs.
$searchArr = array(
    array('%E5%A4%B4%E5%83%8F+%E4%B8%8D%E5%90%8C%E9%A3%8E%E6%A0%BC+%E6%B5%B7%E8%BE%B9', '3c', '1476431870063'),
    array('%E5%A4%B4%E5%83%8F+%E4%BA%BA%E7%89%A9%E5%BD%A2%E8%B1%A1+%E5%8D%8A%E8%BA%AB', '1e', '1476431965788'),
    array('%E5%A4%B4%E5%83%8F+%E4%BA%BA%E7%89%A9%E5%BD%A2%E8%B1%A1+%E8%90%9D%E8%8E%89', '3c', '1476432025419'),
    array('%E5%A4%B4%E5%83%8F+%E4%BA%BA%E7%89%A9%E5%BD%A2%E8%B1%A1+%E7%94%B7%E7%94%9F', '5a', '1476432073843'),
    array('%E5%A4%B4%E5%83%8F+%E7%BB%8F%E5%85%B8%E5%8A%A8%E4%BD%9C+%E5%98%9F%E5%98%B4', '3c', '1476432128601'),
    array('%E5%A4%B4%E5%83%8F+%E4%B8%8D%E5%90%8C%E9%A3%8E%E6%A0%BC+%E5%AD%A4%E7%8B%AC', '1e', '1476432211175'),
    array('%E5%A4%B4%E5%83%8F+%E4%B8%8D%E5%90%8C%E9%A3%8E%E6%A0%BC+90%E5%90%8E', '1e', '1476432252392'),
    array('%E5%A4%B4%E5%83%8F+%E4%B8%8D%E5%90%8C%E9%A3%8E%E6%A0%BC+%E6%A3%AE%E7%B3%BB', '1e', '1476432318321'),
    array('%E5%A4%B4%E5%83%8F+%E4%B8%8D%E5%90%8C%E9%A3%8E%E6%A0%BC+%E5%AD%97%E6%AF%8D', '5a', '1476432384197'),
    array('%E5%A4%B4%E5%83%8F+%E5%8D%A1%E9%80%9A%E5%8A%A8%E6%BC%AB+%E7%BE%8E%E5%B0%91%E5%A5%B3%E6%88%98%E5%A3%AB', '78', '1476432429562'),
    array('%E5%A4%B4%E5%83%8F+%E4%B8%8D%E5%90%8C%E9%A3%8E%E6%A0%BC+%E6%80%A7%E6%84%9F', '3c', '1476432470204'),
);

// Bound the API request so one stalled connection cannot hang the whole run.
$apiContext = stream_context_create(array('http' => array('timeout' => 30)));

for ($page = 0; $page < $pageNum; $page++) {
    // Random phrase + random offset, so repeated runs are less likely to fetch
    // the same images. Adding more searches lowers the duplicate rate further.
    $search = $searchArr[array_rand($searchArr)];
    $pn = $pnArr[array_rand($pnArr)];
    $apiUrl = buildBaiduSearchUrl($search[0], $pn, $search[1], $search[2]);

    $rawJson = file_get_contents($apiUrl, false, $apiContext);
    if ($rawJson === false) {
        error_log('Avatar scraper: search request failed: ' . $apiUrl);
        continue;
    }

    $imgJsonData = json_decode($rawJson, true);
    if (!is_array($imgJsonData) || !isset($imgJsonData['data']) || !is_array($imgJsonData['data'])) {
        error_log('Avatar scraper: unexpected search response for: ' . $apiUrl);
        continue;
    }

    foreach ($imgJsonData['data'] as $dataArr) {
        // Skip entries without a usable thumbnail URL.
        if (is_array($dataArr) && !empty($dataArr['thumbURL'])) {
            putImgToLocal($dataArr['thumbURL']);
        }
    }
}

/**
 * Builds the Baidu image-search JSON URL for one search phrase and offset.
 *
 * @param string $query     URL-encoded search phrase (used for both queryWord and word).
 * @param string $pn        Result offset.
 * @param string $gsm       Value of the "gsm" parameter.
 * @param string $timestamp Name of the trailing empty-valued numeric parameter.
 *
 * @return string The request URL.
 */
function buildBaiduSearchUrl($query, $pn, $gsm, $timestamp)
{
    return 'http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result'
        . '&queryWord=' . $query
        . '&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic='
        . '&word=' . $query
        . '&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr='
        . '&pn=' . $pn
        . '&rn=30&gsm=' . $gsm
        . '&' . $timestamp . '=';
}

/**
 * Downloads one image and stores it under ./userSysAvatorUrl with a generated
 * name of the form "<3 random letters>_<millisecond timestamp>.jpg".
 *
 * @param string $url Image URL to download.
 *
 * @return bool True when the image was written to disk, false when the target
 *              directory could not be created, the download failed or returned
 *              no data, or the write failed.
 */
function putImgToLocal($url)
{
    $dir = './userSysAvatorUrl';

    // The second is_dir() covers another process creating the directory
    // between the first check and mkdir().
    if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
        error_log('Avatar scraper: cannot create directory ' . $dir);
        return false;
    }

    // Browser-like headers with a Baidu referer. No Host header is sent here:
    // the stream wrapper derives it from $url, which matters because the URL
    // is not guaranteed to point at one fixed image host.
    // "Connection: close" is deliberate: file_get_contents() cannot reuse a
    // connection, and requesting keep-alive can leave the read waiting for the
    // server to drop the idle connection.
    $hdrs = array(
        'http' => array(
            'timeout' => 15,
            'header' => "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8\r\n"
                . "Accept-Language: zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3\r\n"
                . "Connection: close\r\n"
                . "Referer: http://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gbk&word=%CD%B7%CF%F1&fr=ala&oriquery=%E5%A4%B4%E5%83%8F&ala=1&alatpl=portait&pos=0\r\n"
                . "User-Agent: Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0\r\n"
                . "X-Requested-With: XMLHttpRequest",
        ),
    );
    $context = stream_context_create($hdrs);

    $imgBin = file_get_contents($url, false, $context);
    if ($imgBin === false || $imgBin === '') {
        // Do not leave an empty .jpg behind for a failed download.
        error_log('Avatar scraper: download failed: ' . $url);
        return false;
    }

    // Regenerate the name in the unlikely case that it already exists.
    do {
        $prefix = chr(mt_rand(97, 122)) . chr(mt_rand(97, 122)) . chr(mt_rand(97, 122));
        $file_name = $prefix . '_' . sprintf('%.0f', microtime(true) * 1000) . '.jpg';
        $path = $dir . '/' . $file_name;
    } while (file_exists($path));

    if (file_put_contents($path, $imgBin) === false) {
        error_log('Avatar scraper: cannot write ' . $path);
        return false;
    }

    return true;
}
这篇关于 php 脚本爬取头像图片的文章就介绍到这儿,希望我们推荐的文章对各位开发者有所帮助!