本文主要介绍那些10w+的公众号都在写什么,希望能为大家解决编程问题提供一定的参考价值,需要的开发者们随着小编来一起学习吧!
出于好奇,我想知道那些10w+的公众号都写了些什么,于是写了几个脚本,爬取了各行业Top公众号的文章,并进行了关键词统计。
抓取数据、分析用到了3种语言:Node.js、Java、Python。废话不多说,直接上代码。
1(NODEJS)
puppeteer模拟登录,抓取微信公众号链接:
/**
* load wechat article urls on newrank.cn
**/
const puppeteer = require('puppeteer');
const fs = require("fs");

// User agent sent by every page the crawler opens (desktop Chrome on Windows 10).
const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36';

// Directory that receives every file the crawler writes.
const workPath = './newrank_cn1111';

// newrank.cn login credentials. NEWRANK_USER / NEWRANK_PASSWORD take precedence
// so real secrets do not have to live in the source; `||` (not `??`) is used on
// purpose so an empty environment variable also falls back to the default.
const userName = process.env.NEWRANK_USER || "公众号";
const ppwwdd = process.env.NEWRANK_PASSWORD || "caiyongji";

// Idempotent: creates the output directory when missing, no-op when it exists.
fs.mkdirSync(workPath, { recursive: true });
const loginUrl = 'https://www.newrank.cn/public/login/login.html?back=https%3A//www.newrank.cn/';
const monthlyRankUrl = 'https://www.newrank.cn/public/info/list.html?period=month&type=data';
const detailUrl = 'https://www.newrank.cn/public/info/detail.html?account=';

/**
 * Crawler entry point.
 *
 * Logs in on newrank.cn, then visits the monthly ranking once per category
 * button. For every category it writes, below `workPath`:
 *   - `<category>.txt`      the text of the ranking table rows,
 *   - `id_<category>.txt`   one account id per line,
 *   - `<category>/<id>_top_<name>.txt` and `<category>/<id>_lastest_<name>.txt`
 *     the article links listed on each account's detail page.
 */
(async () => {
  // `icon` attribute values of the category buttons, in crawl order.
  const categoryIcons = [
    'ss', 'mgs', 'cf', 'kj', 'cy', 'qc', 'ls', 'zc', 'jy', 'xs', 'zw', 'qy',
    'wh', 'bk', 'jk', 'shs', 'ms', 'sj', 'lx', 'ym', 'qg', 'ty', 'mt', 'zs',
  ];
  const rankButtonSelectors = [
    ...categoryIcons.map((icon) => `.wx-right-type-list-spe a[icon=${icon}]`),
    '#wx_month_all',
  ];

  // set headless: true will hide chromium UI
  const browser = await puppeteer.launch({ headless: false });
  try {
    const page = await openPage();
    await page.goto(loginUrl);
    await loginOperate(page);

    // Categories are crawled one after another, never in parallel.
    for (const selector of rankButtonSelectors) {
      await processMonthlyRank(selector);
    }
  } finally {
    // Always release the Chromium process, even when a step above throws.
    await browser.close();
  }

  /**
   * Opens a new tab configured like every other tab of the crawler:
   * shared user agent, 1920x1000 viewport, image requests aborted.
   * @returns {Promise<object>} the puppeteer page
   */
  async function openPage() {
    const tab = await browser.newPage();
    await tab.setUserAgent(userAgent);
    await tab.setViewport({ width: 1920, height: 1000 });
    await tab.setRequestInterception(true);
    // filter to block images
    tab.on('request', (request) => {
      if (request.resourceType() === 'image') {
        request.abort();
      } else {
        request.continue();
      }
    });
    return tab;
  }

  /**
   * Fills in and submits the password login form. Each step is best-effort:
   * a failure is logged with its step tag and the next step is still tried.
   * @param {object} page - tab that currently shows the login page
   */
  async function loginOperate(page) {
    try {
      await page.click('div[data-type=pwd]');
    } catch (err) {
      console.log('login#1', err.message);
    }
    try {
      await page.type('#account_input', userName);
      await page.type('#password_input', ppwwdd);
    } catch (err) {
      console.log('login#2', err.message);
    }
    try {
      await page.click('#pwd_confirm');
    } catch (err) {
      console.log('login#3', err.message);
    }
  }

  /**
   * Reads one property from every element matching a selector.
   * @param {object} tab - puppeteer page to query
   * @param {string} selector - CSS selector
   * @param {string} prop - element property to read, e.g. 'innerText' or 'href'
   * @returns {Promise<string[]>} property values in document order
   */
  function collect(tab, selector, prop) {
    return tab.evaluate(
      (sel, key) => Array.from(document.querySelectorAll(sel), (el) => el[key]),
      selector,
      prop,
    );
  }

  /**
   * Crawls the monthly ranking of one category.
   * @param {string} btn - CSS selector of the category button to click
   */
  async function processMonthlyRank(btn) {
    const tab = await openPage();
    try {
      await tab.goto(monthlyRankUrl);
      try {
        await tab.click(btn);
      } catch (err) {
        console.log('processMonthlyRank#1', err.message);
      }

      // The button label names the output files. textContent (not innerHTML)
      // keeps markup and HTML entities out of the file name.
      const fileName = await tab.evaluate((selector) => {
        const button = document.querySelector(selector);
        return button ? button.textContent.trim() : null;
      }, btn);
      if (!fileName) {
        console.log(`processMonthlyRank: no label found for ${btn}, category skipped.`);
        return;
      }
      console.log('-------------------------' + fileName + '-------------------------');

      await scrollWait(tab);
      await waitSecond(tab);

      const texts = await collect(tab, '.wx_main tr', 'innerText');
      console.log('total rows: ' + texts.length);
      // The first row is excluded from both the count and the dump.
      const rows = texts.slice(1).map((row) => row + '\n\n').join('');
      const contents = '记录条数' + (texts.length - 1) + '\n\n' + rows;
      fs.writeFileSync(workPath + '/' + fileName + '.txt', contents);
      console.log(fileName + ' has been extracted to local.');

      // The matched links are consumed in pairs: even index = account name,
      // odd index = account id.
      const cells = await collect(tab, '.wx_main tr a[href^="detail.html"]', 'innerText');
      let idContents = '';
      for (let i = 0; i + 1 < cells.length; i += 2) {
        const accountName = cells[i];
        const accountId = cells[i + 1];
        idContents += accountId + '\n';
        try {
          await getDetail(fileName, accountName, accountId);
        } catch (err) {
          // One broken account must not abort the rest of the category.
          console.log('getDetail failed for ' + accountId + ': ' + err.message);
        }
      }

      const idFile = 'id_' + fileName;
      fs.writeFileSync(workPath + '/' + idFile + '.txt', idContents);
      console.log(idFile + ' has been extracted to local.');
    } finally {
      await tab.close();
    }
  }

  /**
   * Scrolls to the bottom of the page `n` times, waiting after each scroll.
   * @param {object} p - puppeteer page
   * @param {number} [n=5] - number of scroll/wait rounds
   */
  async function scrollWait(p, n = 5) {
    for (let i = 0; i < n; i++) {
      try {
        await p.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
        // A rejected wait (e.g. the 500 ms timeout) is caught below and only logged.
        await p.waitForNavigation({ timeout: 500, waitUntil: ['networkidle0'] });
      } catch (err) {
        console.log('scroll to bottom and then wait 500 ms.');
      }
    }
  }

  /**
   * Waits up to two seconds for the page to settle.
   * @param {object} p - puppeteer page
   */
  async function waitSecond(p) {
    try {
      await p.waitForNavigation({ timeout: 2000, waitUntil: ['networkidle0'] });
    } catch (err) {
      // Deliberately ignored: the wait is best-effort and any rejection,
      // including the 2 s timeout, just ends the pause.
    }
  }

  /**
   * Saves the article links shown on one account's detail page.
   * @param {string} cat - category name, used as the sub-directory of workPath
   * @param {string} name - account display name
   * @param {string} id - account id, appended to the detail URL
   */
  async function getDetail(cat, name, id) {
    const tab = await openPage();
    try {
      await tab.goto(detailUrl + id);
      await waitSecond(tab);

      const catDir = workPath + '/' + cat;
      fs.mkdirSync(catDir, { recursive: true });

      // [file-name label, link selector]; "lastest" is the spelling used by the
      // page's element id and is kept in the file name unchanged.
      const sections = [
        ['top', '#info_detail_article_top li .title a'],
        ['lastest', '#info_detail_article_lastest li .title a'],
      ];
      for (const [label, selector] of sections) {
        const hrefs = await collect(tab, selector, 'href');
        const urlList = hrefs.map((href) => href + '\n').join('');
        fs.writeFileSync(catDir + '/' + id + '_' + label + '_' + name + '.txt', urlList);
      }
      console.log(id + ' ' + name + ' has been extracted to local.');
    } finally {
      await tab.close();
    }
  }
})().catch((err) => {
  // Surface fatal errors instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});
2(JAVA)
Jsoup抓取微信文章文本:
package com;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadLocalRandom;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
 * Downloads the body text of every article URL collected for one category.
 *
 * Input, below {@code READ_URLS_FOLDER}:
 * <ul>
 * <li>{@code id_<category>.txt} - one account id per line;</li>
 * <li>{@code <category>/<id>_top_<name>.txt} and
 * {@code <category>/<id>_lastest_<name>.txt} - one article URL per line.</li>
 * </ul>
 * Output: {@code WORK_FOLDER/<category>/(top|latest)/<id>_<name>_<n>.txt}, where
 * {@code n} counts the URLs of one input file starting at 1.
 */
public class WeChatUrls extends Thread {

    private File catFile;
    final static Integer ThreadNum = 1;
    final String ERROR = "ERROR";
    private final static String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36";
    private final static String WORK_FOLDER = "T:\\Developer\\puppeteerTestCase\\newrank_cn_articles";
    private final static String READ_URLS_FOLDER = "T:\\Developer\\puppeteerTestCase\\newrank_cn";

    /** Prefix and suffix of the per-category id list files. */
    private final static String ID_FILE_PREFIX = "id_";
    private final static String TXT_SUFFIX = ".txt";
    /** Separators between account id and account name in the URL list file names. */
    private final static String TOP_MARKER = "_top_";
    private final static String LATEST_MARKER = "_lastest_";

    /**
     * @param cat the {@code id_<category>.txt} file of the category to process
     */
    public WeChatUrls(File cat) {
        this.catFile = cat;
    }

    /**
     * Fetches a page and returns the text of its body element.
     *
     * @param url article URL
     * @return the body text, or {@code ERROR} when the request fails
     */
    private String getUrlProxyContent(String url) {
        String body = ERROR;
        try {
            Document doc = Jsoup.connect(url).userAgent(USER_AGENT).get();
            body = doc.select("body").text();
        } catch (IOException e) {
            System.out.println("ERROR URL: " + url);
            e.printStackTrace();
        }
        return body;
    }

    /**
     * Writes {@code content} to {@code fileName}, creating parent directories
     * and overwriting any existing file. Failures are printed, not thrown.
     */
    private void write(String content, String fileName) {
        File f = new File(fileName);
        File parent = f.getParentFile();
        if (parent != null) {
            parent.mkdirs();
        }
        // FileWriter without the append flag: existing content is replaced.
        try (BufferedWriter bw = new BufferedWriter(new FileWriter(f.getAbsoluteFile()))) {
            bw.write(content);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Reads all lines of a text file; the reader is closed even on failure. */
    private static List<String> readLines(File file) throws IOException {
        List<String> lines = new ArrayList<>();
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
        }
        return lines;
    }

    /** True for regular files named {@code id_<category>.txt}. */
    private static boolean isCategoryIndex(File file) {
        String name = file.getName();
        return file.isFile()
                && name.startsWith(ID_FILE_PREFIX)
                && name.endsWith(TXT_SUFFIX)
                && name.length() > ID_FILE_PREFIX.length() + TXT_SUFFIX.length();
    }

    /** Submits one task per {@code id_<category>.txt} file found in {@code READ_URLS_FOLDER}. */
    public static void main(String[] args) throws Exception {
        File[] cataFiles = new File(READ_URLS_FOLDER).listFiles();
        if (cataFiles == null) {
            // listFiles() returns null when the folder is missing or unreadable.
            System.out.println("ERROR FOLDER: " + READ_URLS_FOLDER);
            return;
        }
        ExecutorService service = Executors.newFixedThreadPool(ThreadNum);
        Arrays.stream(cataFiles)
                .filter(WeChatUrls::isCategoryIndex)
                .forEach(catFile -> service.execute(new WeChatUrls(catFile)));
        service.shutdown();
    }

    /** Downloads every article listed for the category of {@code catFile}. */
    private void process() {
        String fileName = catFile.getName();
        if (!fileName.startsWith(ID_FILE_PREFIX) || !fileName.endsWith(TXT_SUFFIX)) {
            System.out.println("ERROR FILE: " + fileName);
            return;
        }
        // Strip prefix and suffix instead of splitting on "." and "_", so category
        // names that contain those characters survive intact.
        String catagory = fileName.substring(ID_FILE_PREFIX.length(),
                fileName.length() - TXT_SUFFIX.length());

        File[] urlFiles = new File(READ_URLS_FOLDER, catagory).listFiles();
        if (urlFiles == null) {
            return;
        }

        List<String> wechatIds;
        try {
            // Read once instead of re-reading the id list for every URL file.
            wechatIds = readLines(catFile);
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }

        for (File urlFile : urlFiles) {
            try {
                processUrlFile(urlFile, catagory, wechatIds);
            } catch (InterruptedException e) {
                // Restore the flag and stop; the pool is shutting this worker down.
                Thread.currentThread().interrupt();
                return;
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Downloads the articles of one URL list file for every id that owns it.
     * Ownership is decided by the exact {@code <id>_top_} / {@code <id>_lastest_}
     * prefix, so ids containing "_" and ids that are prefixes of other ids are
     * handled correctly.
     */
    private void processUrlFile(File urlFile, String catagory, List<String> wechatIds)
            throws IOException, InterruptedException {
        String urlFileName = urlFile.getName();
        // A URL file is either a "top" or a "lastest" list, so one counter suffices.
        int count = 1;
        for (String wechatId : wechatIds) {
            if (wechatId.isEmpty()) {
                continue; // a blank id would match every file
            }
            boolean top;
            String rest;
            if (urlFileName.startsWith(wechatId + TOP_MARKER)) {
                top = true;
                rest = urlFileName.substring(wechatId.length() + TOP_MARKER.length());
            } else if (urlFileName.startsWith(wechatId + LATEST_MARKER)) {
                top = false;
                rest = urlFileName.substring(wechatId.length() + LATEST_MARKER.length());
            } else {
                continue;
            }
            String wechatName = rest.endsWith(TXT_SUFFIX)
                    ? rest.substring(0, rest.length() - TXT_SUFFIX.length())
                    : rest;

            for (String wechatUrl : readLines(urlFile)) {
                if (wechatUrl.trim().isEmpty()) {
                    continue;
                }
                String writePath = String.join(File.separator,
                        WORK_FOLDER,
                        catagory,
                        top ? "top" : "latest",
                        wechatId + "_" + wechatName + "_" + (count++) + ".txt");
                String content = getUrlProxyContent(wechatUrl);
                write(content, writePath);
                System.out.println(writePath);
                // Random 0.5-3 s pause between requests.
                Thread.sleep(ThreadLocalRandom.current().nextInt(500, 3000));
            }
        }
    }

    @Override
    public void run() {
        process();
    }
}
3(PYTHON)
wordcloud生成词云:
# -*- coding: utf-8 -*-
import json
import random
import time
import os
from pyecharts import Bar,Geo,Line,Overlap
import jieba
from scipy.misc import imread
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
from collections import Counter
os.chdir('T:/Developer/puppeteerTestCase/newrank_cn_articles')

# Words dropped from the statistics (WeChat page boilerplate).
stopWords = ['微信','二维码','二维','扫一','一扫','公众','赞赏','转账','关注','打开','阅读','图片','关闭','取消','程序']


def proc(folder, type):
    """Build, show and save the word cloud of one category.

    Reads every file in ``./<folder>/<type>``, segments the text with jieba,
    drops one-character words and stop words, and renders the word frequencies
    as a word cloud shaped and coloured by ``back.jpg``. The image is saved as
    ``./<type>_<folder>.jpg``.

    folder -- category directory name below the working directory
    type   -- sub-directory of the category to read, e.g. 'top'
    """
    rootdir = './' + folder + '/' + type
    if not os.path.isdir(rootdir):
        # A category without this sub-folder must not abort the whole run.
        print('skip, no such directory: ' + rootdir)
        return

    fileLines = []
    for entry in os.listdir(rootdir):
        path = os.path.join(rootdir, entry)
        if os.path.isfile(path):
            try:
                # Read-only, and closed deterministically.
                with open(path, 'r') as fo:
                    fileLines += fo.readlines()
            except (OSError, UnicodeError):
                print('error while processing file: ' + path)

    _str = ' '.join(fileLines)
    stop = set(stopWords)
    words_list = [k for k in jieba.cut_for_search(_str) if len(k) > 1 and k not in stop]
    if not words_list:
        # Nothing to draw for this category.
        print('skip, no words found in: ' + rootdir)
        return

    # NOTE(review): `imread` comes from `scipy.misc` in this file's imports;
    # confirm that the installed SciPy still provides it.
    back_color = imread('back.jpg')
    wc = WordCloud(background_color='white',
                   max_words=2000,
                   mask=back_color,
                   max_font_size=300,
                   font_path="C:/Windows/Fonts/msyh.ttc",
                   random_state=42)
    _count = Counter(words_list)
    wc.generate_from_frequencies(_count)
    image_colors = ImageColorGenerator(back_color)
    wc.recolor(color_func=image_colors)

    # The pil way (if you don't have matplotlib)
    image = wc.to_image()
    image.show()
    jpgFile = './' + type + '_' + folder + '.jpg'
    image.save(jpgFile)
    print('image File saved:' + jpgFile)


# Render the 'top' word cloud of every category directory.
basedir = './'
baselist = os.listdir(basedir)
for l in range(0, len(baselist)):
    p = os.path.join(basedir, baselist[l])
    if os.path.isdir(p):
        proc(os.path.basename(p), 'top')
4
词云结果涉及23个维度,得出结果如下:
TOP500公众号文章
创业
健康
教育
乐活
企业
情感
体育娱乐
文化
文摘
幽默
政务
旅行
时事
时尚
民生
汽车
百科
科技
美体
美食
职场
财富
5
数据集已开源。
关注公众号 caiyongji 回复 10w_article。获取代码以及数据
或github:https://github.com/caiyongji/wechat-ranking
这篇关于那些10w+的公众号都在写什么?的文章就介绍到这儿,希望我们推荐的文章对程序员们有所帮助!