This article shows how to use Python to collect articles from the Baidu News feed and publish them automatically to phpcms. Hopefully it offers a useful reference for developers tackling the same problem; follow along to learn how it works.
Automatic publishing comes down to filling in the right phpcms database fields directly. Publish one article by hand and watch which tables change in the database; that diff is the entry point for building an auto-publishing tool for phpcms, 帝国cms (EmpireCMS), or any similar CMS. A row-count diff sketch follows below.
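If you want to find that entry point yourself, a low-tech way is to snapshot the row count of every table, publish one article by hand in the admin panel, snapshot again, and diff. The following is a minimal Python 2 sketch under the same assumptions as the script below (local MySQL, user root, empty password, database phpcmsv9); the helper name table_rowcounts is chosen here for illustration, and note that tables updated in place (counters, for example) will not show up in a row-count diff.

# -*- coding: utf-8 -*-
import MySQLdb

def table_rowcounts():
    # Connection parameters assume the same local phpcms v9 setup as the scraper below.
    con = MySQLdb.connect('localhost', 'root', '', 'phpcmsv9', charset='utf8')
    cur = con.cursor()
    cur.execute('SHOW TABLES')
    counts = {}
    for (table,) in cur.fetchall():
        cur.execute('SELECT COUNT(*) FROM `%s`' % table)
        counts[table] = cur.fetchone()[0]
    con.close()
    return counts

before = table_rowcounts()
raw_input('Publish one article in the phpcms admin panel, then press Enter...')
after = table_rowcounts()
for table in sorted(after):
    if after[table] != before.get(table):
        print '%s: %s -> %s' % (table, before.get(table), after[table])

Run against a stock phpcms v9 install, this kind of diff points at v9_news and v9_news_data, which is exactly where the script below writes.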
# coding=utf-8
'''Purpose: collect articles from Baidu News (http://news.baidu.com/).
Baidu News aggregates stories from many industry sites and already
de-duplicates them, which makes it a good source for industry news.
Approach:
1. A dict maps each site's domain to its title/content regexes and page encoding.
2. Collected URLs are appended to a file, which is checked to skip duplicates.
3. Baidu News refreshes about every 5 minutes, so run this script on a similar schedule.
'''
import pycurl, StringIO, json, urllib, urllib2, re
import MySQLdb
import time
from warnings import filterwarnings
import MySQLdb as Database
filterwarnings('ignore', category=Database.Warning)
import sys
reload(sys)
sys.setdefaultencoding('utf8')

headers = [
    "User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36",
    "Cookie: spversion=20130314; historystock=603158%7C*%7C1A0001%7C*%7C000967%7C*%7C603328; Hm_lvt_78c58f01938e4d85eaf619eae71b4ed1=1467682875,1467682943,1467682974,1468293176; Hm_lpvt_78c58f01938e4d85eaf619eae71b4ed1=1468293226",
]

def curl(url):
    c = pycurl.Curl()                          # build a curl handle
    #c.setopt(pycurl.REFERER, 'http://qy.m.58.com/')  # set the referer if needed
    c.setopt(pycurl.FOLLOWLOCATION, True)      # follow redirects
    c.setopt(pycurl.MAXREDIRS, 5)              # cap the number of redirects
    c.setopt(pycurl.CONNECTTIMEOUT, 60)        # connect timeout (seconds)
    c.setopt(pycurl.TIMEOUT, 120)              # download timeout (seconds)
    c.setopt(pycurl.ENCODING, 'gzip,deflate')
    # c.setopt(c.PROXY, ip)                    # proxy, if needed
    c.fp = StringIO.StringIO()
    c.setopt(pycurl.URL, url)                  # target URL
    c.setopt(pycurl.HTTPHEADER, headers)       # request headers
    # c.setopt(pycurl.POST, 1)
    # c.setopt(pycurl.POSTFIELDS, data)        # POST body, if needed
    c.setopt(c.WRITEFUNCTION, c.fp.write)      # write callback: buffer the response
    c.perform()
    code = c.getinfo(c.HTTP_CODE)              # HTTP status code
    html = c.fp.getvalue()                     # page source
    return html

# Extract the first capture group of a regex, or 'no' if nothing matches
def search(req, html):
    text = re.search(req, html)
    if text:
        data = text.group(1)
    else:
        data = 'no'
    return data

# Strip superfluous tags (links, images, scripts, styles...) from the article body.
# flags=re.I makes the rules case-insensitive, so separate upper-case variants
# (<IMG>, <DIV>, ...) are unnecessary.
def content_sort(content):
    content = re.sub('<p.*?>', '<p>', content, flags=re.I)
    content = re.sub('</?span.*?>', '', content)
    content = re.sub('</?a.*?>', '', content)
    content = re.sub('<!.*?>', '', content)
    content = re.sub('</?img.*?>', '', content, flags=re.I)
    content = re.sub('</?div.*?>', '', content, flags=re.I)
    content = re.sub('</?iframe.*?>', '', content)
    content = re.sub('</?center.*?>', '', content)
    content = re.sub('</?[fF].*?>', '', content)
    content = re.sub('<script.*?>[\s\S]*?</script>', '', content)
    content = re.sub('</?strong.*?>', '', content)
    content = re.sub('<INPUT.*?>', '', content, flags=re.I)
    content = re.sub('<style.*?>[\s\S]*?</style>', '', content)
    content = re.sub(' ', '', content)         # strip stray spaces (decoded &nbsp; remnants)
    return content

# Map each domain to its title/content regexes and page encoding
req_dict = {
    'finance.sina.com.cn': {'title': '<h1.*?>(.*?)</h1>', 'content': '<!-- 原始正文start -->([\s\S]*?)<!-- 原始正文end -->', 'decode': 'utf-8'},
    'stock.eastmoney.com': {'title': '<h1.*?>(.*?)</h1>', 'content': '<div id="ContentBody" class="Body">([\s\S]*?)<div class="BodyEnd">', 'decode': 'gbk'},
    'finance.eastmoney.com': {'title': '<h1.*?>(.*?)</h1>', 'content': '<div id="ContentBody" class="Body">([\s\S]*?)<div class="BodyEnd">', 'decode': 'gbk'},  #ok
    'guba.eastmoney.com': {'title': '<title>(.*?)_.*?</title>', 'content': '<div id="zwconbody">([\s\S]*?)<div class="zwconbtns clearfix">', 'decode': 'utf-8'},  #ok
    'stock.jrj.com.cn': {'title': '<title>(.*?)-', 'content': '<div class="texttit_m1">([\s\S]*?)<div id="itougu">', 'decode': 'gbk'},
    'hk.jrj.com.cn': {'title': '<title>(.*?)-', 'content': '<div class="texttit_m1">([\s\S]*?)<div id="itougu">', 'decode': 'gbk'},
    'hkstock.cnfol.com': {'title': '<title>(.*?)_.*?</title>', 'content': '<div class="ArtM" id="Content">([\s\S]*?)<!--正文结束-->', 'decode': 'utf-8'},  #ok
    'sc.stock.cnfol.com': {'title': '<title>(.*?)_.*?</title>', 'content': '<div class="ArtM" id="Content">([\s\S]*?)<!--正文结束-->', 'decode': 'utf-8'},  #ok
    'money.163.com': {'title': '<title>(.*?)_.*?</title>', 'content': '<div class="post_text".*?">([\s\S]*?)<!--.*?s -->', 'decode': 'utf-8'},
    'www.chinastock.com.cn': {'title': '<div class="d_title">([\s\S]*?)</div>', 'content': '<div class="d_content" id="Zoom">([\s\S]*?)<div class="dleft_new_attachment">', 'decode': 'utf-8'},
    'stock.huagu.com': {'title': '<h1 id="h1-title">([\s\S]*?)</h1>', 'content': '<div class="article_con" id="div-article-content">([\s\S]*?)<div class="clear"></div>', 'decode': 'utf-8'},
    'stock.sohu.com': {'title': '<h1 itemprop="headline">([\s\S]*?)</h1>', 'content': '<div itemprop="articleBody">([\s\S]*?)<div class="original-title"', 'decode': 'gbk'},
    'stock.cngold.org': {'title': '<title>(.*?)-.*?</title>', 'content': '<div class="det_content" id="zoom">([\s\S]*?)<div class="listPage">', 'decode': 'utf-8'},
    'hk.stock.hexun.com': {'title': '<title>(.*?)[-_|].*?</title>', 'content': '<div class="art_contextBox">([\s\S]*?)<div class="showAll">', 'decode': 'utf-8'},
    'stock.gucheng.com': {'title': '<title>(.*?)[-_|].*?</title>', 'content': '<div class="content">([\s\S]*?)</div>', 'decode': 'utf-8'},
    'www.cnstock.com': {'title': '<title>(.*?)-.*?</title>', 'content': '<div class="content-inner" id="qmt_content_div">([\s\S]*?)</div>', 'decode': 'gbk'},
    'www.ccstock.cn': {'title': '<title>(.*?)-.*?</title>', 'content': '<div id="newscontent">([\s\S]*?)</div>', 'decode': 'utf-8'},
    'news.emoney.cn': {'title': '<title>(.*?)-.*?</title>', 'content': '<div class="RL_details_content">([\s\S]*?)<div class="PageNav">', 'decode': 'utf-8'},
    'finance.ce.cn': {'title': '<title>(.*?)</title>', 'content': '<div class=TRS_Editor>([\s\S]*?)<textarea id="allinfo"', 'decode': 'gbk'},
    'www.p5w.net': {'title': '<title>(.*?)[_-|].*?</title>', 'content': '<div class="text">([\s\S]*?)<div class="pages">', 'decode': 'gbk'},
    'www.nbd.com.cn': {'title': '<title>(.*?)[_-|][\s\S]*?</title>', 'content': '<div class="main-left-article">([\s\S]*?)<div style="overflow:', 'decode': 'utf-8'},
    'stock.hexun.com': {'title': '<title>(.*?)[-_|].*?</title>', 'content': '<div class="art_contextBox">([\s\S]*?)<div class="showAll">', 'decode': 'gbk'},
    'stock.caijing.com.cn': {'title': '<title>(.*?)[-_|].*?</title>', 'content': '<div id="the_content".*?>([\s\S]*?)<div class="ar_writer"', 'decode': 'utf-8'},
}

def next_id():
    '''Predict the id the next article will get, so its front-end URL can be
    built. phpcms needs that URL stored in the database for the article to
    show on the front end. The WHERE clause matches every row, so this is
    effectively a row count; it assumes ids are contiguous.'''
    con = MySQLdb.connect('localhost', 'root', '', 'phpcmsv9', charset='utf8')
    with con:
        cur = con.cursor()
        cur.execute("select id from v9_news where title = title")
        numrows = int(cur.rowcount)
    return numrows + 1

def CmsSQL(title, content, idnum, urlid):
    '''Insert one article into the two phpcms tables: the body goes into
    v9_news_data, the metadata (title, category, URL, timestamps) into v9_news.'''
    value1 = [content, idnum]
    value2 = [title, urlid, int(time.time()), int(time.time())]
    db = MySQLdb.connect('localhost', 'root', '', 'phpcmsv9', charset='utf8')
    cursor = db.cursor()
    cursor.execute("insert into v9_news_data (content,id) values(%s,%s)", value1)
    cursor.execute("insert into v9_news(title,catid,typeid,url,inputtime,updatetime) values(%s,6,0,%s,%s,%s)", value2)
    db.commit()
    db.close()

url = 'http://news.baidu.com/n?cmd=4&class=gegu&tn=rss'
urls = re.findall(r'<link><!\[CDATA\[(.*?)\]\]></link>', curl(url))
urls.reverse()                                 # oldest first, so ids follow publication order

for url in urls:
    url = url.strip()
    with open('urls.txt') as f1:               # urls.txt holds every URL already collected; create an empty file before the first run
        collected = f1.read()
    if url in collected:
        print u'URL already collected:', url
        continue
    domain = url.split('/')[2]
    if domain not in req_dict:                 # no regexes written for this site yet
        print u'no regex for this domain'
        open('requrl', 'a+').write(url + '\n')
        continue
    time.sleep(1)
    try:
        html = curl(url)                       # fetch once, reuse for title and body
        title = search(req_dict[domain]['title'], html).decode(req_dict[domain]['decode'])
        content = url + search(req_dict[domain]['content'], html).decode(req_dict[domain]['decode'])
    except:
        continue
    idnum = next_id()
    urlid = 'http://localhost/index.php?m=content&c=index&a=show&catid=6&id=%s' % idnum
    print idnum, content_sort(title)
    CmsSQL(content_sort(title), content_sort(content), idnum, urlid)
    f1w = open('urls.txt', 'a+')               # remember this URL so it is not collected twice
    f1w.write(url + '\n')
    f1w.close()
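Since Baidu News refreshes roughly every five minutes, the docstring suggests running the scraper on a timer. A cron entry is the usual answer; the following is an equally minimal Python wrapper, assuming the script above is saved as baidu_news.py (a filename chosen here for illustration). Remember to create an empty urls.txt next to it before the first run.

import subprocess
import time

while True:
    # Run the scraper as a child process so a crash in one pass
    # does not kill the polling loop.
    subprocess.call(['python', 'baidu_news.py'])
    time.sleep(300)  # poll every 5 minutes, matching Baidu News's refresh cycle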
Source: http://bigwayseo.com/2456
That concludes this walkthrough of collecting Baidu News articles with Python and publishing them automatically to phpcms. Hopefully it proves useful to fellow developers!