A crawler for the 51job website
- 1. Motivation
- 2. A look at the 51job site
- 3. Code excerpts
- 4. Results
1. Motivation
My school requires us to collect relevant job-posting information every day, so I wrote a crawler to automate it.
2. A look at the 51job site
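Two things stand out when inspecting the site. First, a search is just a GET request: the city code, keyword, page number, and filters such as company size are all encoded in the URL of the results page (see `get_url` in the next section). Second, the results are not in the static HTML: they sit in a `<script>` tag as a JSON object assigned to `window.__SEARCH_RESULT__`, so the crawler extracts them with regular expressions instead of walking the DOM. Each result carries a `job_href` field pointing at a detail page, where fields like company name and type live in elements with classes such as `com_msg`, `com_tag`, and `cn`.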
3. Code excerpts
```python
import time
import re
import urllib.parse
from urllib import request

import xlwings as xw
from bs4 import BeautifulSoup


# Write the collected rows to a dated .xlsx via xlwings
def create_excel(excel_data):
    localtime = time.localtime(time.time())
    name = str(localtime.tm_mon) + str(localtime.tm_mday) + college + ".xlsx"
    app = xw.App(visible=True, add_book=False)
    # create a new workbook
    wb = app.books.add()
    # wb.save('example.xlsx')
    # reference the worksheet
    sht = wb.sheets['sheet1']
    # expand='table' spills the 2-D list across rows and columns from A1
    sht.range('A1').options(expand='table').value = excel_data
    print(sht.range('A1').value)
    wb.save(name)
    # close the workbook
    wb.close()
    # quit Excel
    app.quit()


# Debug helper: print every cell that will be written
def write_excel(excels):
    for excel in excels:
        for word in excel:
            print(word)
```
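For reference, a minimal way to exercise `create_excel` might look as follows. This is a hypothetical quick test: it assumes the module-level `college` string is set, and that Excel and the `xlwings` package are installed, since `xw.App` drives a real Excel instance.

```python
# hypothetical quick test; `college` must be defined at module level
college = "软件学院"
rows = [
    ["序号", "公司名称", "职务名称"],          # header row
    [1, "Example Co.", "Software Intern"],    # one data row
]
create_excel(rows)  # writes e.g. "1128软件学院.xlsx" (month + day + college)
```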
```python
# Example search: 上海 (Shanghai), 软件工程 (software engineering)
# Fetch one page of search results and collect the detail-page links
def get_html(job_pos, search):
    # get_url already URL-quotes the keyword, so pass it through unquoted
    target_url = get_url(job_pos, 3, search)
    print(target_url)
    response = request.urlopen(target_url)
    # 51job pages are GBK-encoded
    html_doc = response.read().decode('gbk')
    soup = BeautifulSoup(html_doc, 'html.parser')
    scripts = soup.find_all("script")
    # the results live in a JS assignment: window.__SEARCH_RESULT__ = {...}
    ans = re.findall(r"window\.__SEARCH_RESULT__\s*=\s*({.*})", str(scripts))
    # pull every job_href (a detail-page URL ending in t=0) out of that JSON
    ans = re.findall(r"job_href\":\"(https:[^\"]*t=0)", ans[0])
    for an in ans:
        # the embedded URLs escape their slashes (https:\/\/...); strip the backslashes
        job_list.append(str(an).replace("\\", ""))
    return job_list
```
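The two-stage regex is the core trick, so here is a self-contained sketch of it run against a stub of what the embedded script looks like (the JSON is abridged and the values are made up):

```python
import re

# a stub of the <script> content on a 51job results page (abridged, hypothetical)
script_text = (
    'window.__SEARCH_RESULT__ = {"engine_search_result":'
    '[{"job_href":"https:\\/\\/jobs.51job.com\\/shanghai\\/123.html?s=01&t=0"}]}'
)

# stage 1: grab the whole JSON object literal
blob = re.findall(r"window\.__SEARCH_RESULT__\s*=\s*({.*})", script_text)[0]
# stage 2: pull every job_href value that ends in t=0
links = re.findall(r"job_href\":\"(https:[^\"]*t=0)", blob)
print([link.replace("\\", "") for link in links])
# ['https://jobs.51job.com/shanghai/123.html?s=01&t=0']
```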
```python
# Scrape each job's detail page and build one row per posting
def get_information(aim_websites):
    i = 1
    job_information_list.append(type_table)
    for website in aim_websites:
        job_information = []
        # keep only the first ten postings
        if i == 11:
            break
        # e.g. "https://jobs.51job.com/shanghai-ypq/125300004.html?s=01&t=0"
        response = request.urlopen(website)
        html_doc = response.read().decode('gbk')
        soup = BeautifulSoup(html_doc, 'html.parser')
        # company name
        company_name = soup.find(class_="com_msg").p['title']
        # company type
        company_type = soup.find(class_="com_tag").p['title']
        # posting details
        cn = soup.find(class_="cn")
        job_name = cn.h1['title']
        stuff_info = cn.find(class_="msg ltype")['title']
        # the title reads like "location | experience | education | headcount | date";
        # split it and strip the non-breaking spaces
        ans = stuff_info.split("|")
        detail = []
        aim_create.append(type_table)
        for an in ans:
            detail.append(str(an).replace(u'\xa0', u''))
        print('--------------------' + str(i) + '----------------------')
        # index
        job_information.append(i)
        i += 1
        # category
        job_information.append(college)
        # creation date (today)
        localtime = time.localtime(time.time())
        today = str(localtime.tm_year) + '/' + str(localtime.tm_mon) + '/' + str(localtime.tm_mday)
        job_information.append(today)
        # expiry date (unknown)
        job_information.append('')
        # work location
        job_information.append(detail[0])
        # company name
        job_information.append(company_name)
        # link
        job_information.append(website)
        # job title
        job_information.append(job_name)
        # recommended or not
        job_information.append('')
        # company nature
        job_information.append(company_type)
        # job nature (实习 = internship)
        job_information.append('实习')
        # education required
        job_information.append(detail[2])
        # good credit
        job_information.append('1')
        # bad credit
        job_information.append('')
        # Fortune 500
        job_information.append('')
        # publicly listed
        job_information.append('')
        # 200+ employees
        job_information.append('1')
        # under 200 employees
        job_information.append('')
        # number of openings
        job_information.append('1')
        # headcount wanted
        job_information.append(detail[3])
        job_information_list.append(job_information.copy())
    return job_information_list


# unused stub
def get_pos():
    return
```
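These functions lean on several module-level globals that the excerpt never shows. A guessed, minimal set of definitions that would make the snippets runnable follows; the concrete values, in particular the `pos` area code and the `company_size` codes, are assumptions based only on how `get_url` uses them:

```python
# hypothetical globals assumed by the snippets above
url = "https://search.51job.com"          # assumed search host
pos = {"上海": "020000"}                   # city name -> 51job area code (guessed value)
company_size = ["01", "02", "03", "04"]   # 51job companysize codes (guessed values)
college = "软件学院"                       # category label written into each row
type_table = ["序号", "类别", "创建时间", "过期时间", "工作地点", "公司名称",
              "链接地址", "职务名称", "是否推荐", "公司性质", "职务性质",
              "教育背景", "信用良好", "不良信用", "500强", "上市",
              "200人以上", "200人以下", "岗位数", "需求人数"]  # header, matches the appends above
job_list = []                              # detail-page URLs collected by get_html
job_information_list = []                  # rows handed to create_excel
aim_create = []
```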
```python
# Build the search URL for a city, a fixed company size, and a keyword
def get_url(pos_name, num, search):
    postion = pos[pos_name]
    # note: the num parameter is overridden by a fixed company-size code
    num = company_size[3]
    target = (url + "/list/" + postion + ",000000,0000,00,9,99,"
              + urllib.parse.quote(search)
              + ",2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99"
              + "&degreefrom=99&jobterm=99&companysize=" + num
              + "&ord_field=0&dibiaoid=0&line=&welfare=")
    return target


# Rewrite the trailing ",<n>.html" page number to get the URLs of pages 1-9
def turn_page(source):
    ans_list = []
    for i in range(1, 10):
        ans = re.sub(r',(\d*).html', "," + str(i) + ".html", source)
        ans_list.append(ans)
    return ans_list


# Join a character sequence back into a string
def position(chars):
    string = ""
    for char in chars:
        # the original read `string = char + ""`, which kept only the last character
        string = string + char
    return string


# Convert a string to \uXXXX escape notation
def to_unicode(string):
    ret = ''
    for v in string:
        ret = ret + hex(ord(v)).upper().replace('0X', '\\u')
    return ret
```
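Tying it together, here is a sketch of a driver under the same assumptions as the globals above (the city and keyword are just examples; pagination via `turn_page` is left out):

```python
if __name__ == "__main__":
    # collect detail-page links for one search, then scrape and export them
    links = get_html("上海", "软件工程")
    rows = get_information(links)
    write_excel(rows)    # optional: print everything first for a sanity check
    create_excel(rows)   # write the dated .xlsx
```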
4. Results
Some fields in the output are wrong: the headcount, education, and posting date can land in the wrong columns, because their positions in the split detail string vary from posting to posting (to be improved).