Scraping Baidu Wenku Search Results with Python



Note: the code below is for technical research and analysis only; please delete anything downloaded with it within 24 hours.


Python is an easy language to pick up, and once past the basics you can get a lot done with it. It is particularly well suited to web scraping, which is what this example demonstrates: fetching documents from Baidu Wenku in under a hundred lines of code.

Scraping documents from Baidu Wenku breaks down into two steps:

1. Given a keyword, search Baidu Wenku and parse the document URLs out of each result page

2. Batch-download the documents at those URLs

There is plenty of code online for scraping Baidu's regular search results with Python, but I could not find any for Baidu Wenku. Comparing the two URLs, the differences are small: Baidu web search mostly just uses abbreviated parameter names. The search-scraping code therefore only needs minor changes, so I won't paste the intermediate version here; the complete code appears at the end of the article.

Baidu search results URL:

http://www.baidu.com/s?wd=%E5%8D%97%E4%BA%AC+%E5%8E%86%E5%B9%B4+%E8%8B%B1%E8%AF%AD+%E7%9C%9F%E9%A2%98&pn=0&cl=3&rn=100

Baidu Wenku search results URL:

https://wenku.baidu.com/search?word=%C4%CF%BE%A9+%C0%FA%C4%EA+%D3%A2%D3%EF+%D5%E6%CC%E2&org=0&fd=0&lm=0&od=0&pn=20

Differences between the two URLs:

1. http -> https

2. wd -> word

3. s -> search

Once you have the search-result URL format, pagination works the same way on both sites: the pn parameter offsets into the result list. Parse each result page, extract the document URLs, and save them; that completes the first step.
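As a quick illustration of the URL changes and the pn paging, here is a minimal sketch that builds paged Wenku search URLs. One assumption worth flagging: in the sample URLs above, the keyword is percent-encoded as GBK on Wenku (%C4%CF%BE%A9 is GBK for 南京) but as UTF-8 on Baidu search, so the sketch converts the keyword to GBK first; the post itself does not discuss the encoding.

# coding=utf8
import urllib

def wenku_search_url(keyword, pn):
    # Baidu search uses http, /s and wd=; Wenku uses https, /search and word=.
    # Assumption: the keyword must be GBK-encoded, matching the sample URL above.
    word = urllib.quote_plus(keyword.decode('utf8').encode('gbk'))
    return "https://wenku.baidu.com/search?word=%s&pn=%d" % (word, pn)

# pn is a result offset; the step-1 code below advances it by 10 per page
for page in range(3):
    print wenku_search_url('南京 历年 英语 真题', page * 10)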

The second step is simply to walk through the saved document URLs and download each one. The only catch is that the request has to impersonate a mobile browser: the desktop pages paginate the document in a way that prevents downloading it in full.
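Spoofing a mobile browser with requests is just a headers dict; a minimal sketch (the UA string matches the one in the step-2 code below, and the URL is a placeholder, not a real document):

# -*- coding: utf-8 -*-
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 "
                  "Mobile Safari/537.36"
}
# Placeholder URL: substitute a real entry from wenku.doc.url.txt
html = requests.get("https://wenku.baidu.com/view/xxxx.html", headers=headers).text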

That is the rough outline; now for the code.

Code for step 1:

1. Build the search-results URL from the keyword

2. Fetch the search results from Baidu Wenku over https

3. Parse the results and append each document's URL to a txt file

# coding=utf8
import urllib
import urllib2
import random
import re

from utils.FileUtils import writeContentStr2File, writeAppend2File

# Rotate through several user agents so Baidu is less likely to block the IP
user_agents = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533+ '
    '(KHTML, like Gecko) Element Browser 5.0',
    'IBM WebExplorer /v0.94',
    'Galaxy/1.0 [en] (Mac OS X 10.5.6; U; en)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
    'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 '
    '(KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/28.0.1468.0 Safari/537.36',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; TheWorld)']


def baidu_search(keyword, pn):
    # The Baidu web-search version, kept for comparison:
    # p = {'wd': keyword}
    # url = ("http://www.baidu.com/s?" + urllib.urlencode(p)
    #        + "&pn={0}&cl=3&rn=100").format(pn)
    p = {'word': keyword}
    url = ("https://wenku.baidu.com/search?" + urllib.urlencode(p)
           + "&pn={0}&cl=3&rn=100").format(pn)
    print url
    res = urllib2.urlopen(url)
    html = res.read()
    return html


def getList(regex, text):
    # Return all matches of regex in text
    arr = []
    res = re.findall(regex, text)
    if res:
        for r in res:
            arr.append(r)
    return arr


def getMatch(regex, text):
    # Return the first match of regex in text, or "" if none
    res = re.findall(regex, text)
    if res:
        return res[0]
    return ""


def clearTag(text):
    # Strip HTML tags from a snippet
    p = re.compile(u'<[^>]+>')
    return p.sub("", text)


def downloadurl(url):
    # Fetch a URL under a randomly chosen user agent
    domain = urllib2.Request(url)
    r = random.randint(0, 9)
    domain.add_header('User-agent', user_agents[r])
    domain.add_header('connection', 'keep-alive')
    response = urllib2.urlopen(domain)
    return response.read()


def geturl(keyword):
    for page in range(10):
        pn = page * 10
        content = baidu_search(keyword, pn)
        # Save the raw result page (written as GBK) for debugging
        writeContentStr2File(content, "baidu.result.html", "GBK")
        prefix = u"https://wenku.baidu.com/view"
        arrList = getList(u"href=\"" + prefix + u"(.*?)\"", content)
        for item in arrList:
            url = prefix + item
            url = url.replace("?from=search", "")
            print url
            writeAppend2File(url + '\n', "wenku.doc.url.txt")


if __name__ == '__main__':
    geturl('历年英语真题')  # keyword: past years' English exam papers
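The two helpers imported from utils.FileUtils are the author's own and are not included in the post. A minimal sketch of what they would have to do, assuming they are plain file writers whose names and signatures are inferred from the calls above:

# coding=utf8
# Hypothetical reconstruction of the utils.FileUtils helpers used by step 1;
# the real implementation is not shown in the original post.

def writeContentStr2File(content, filename, encoding):
    # Overwrite filename with content in the requested encoding
    # (step 1 passes "GBK"), decoding raw bytes first if needed
    if isinstance(content, str):
        content = content.decode(encoding, 'ignore')
    with open(filename, 'w') as f:
        f.write(content.encode(encoding, 'ignore'))

def writeAppend2File(content, filename):
    # Append a line to filename (used to collect document URLs)
    with open(filename, 'a') as f:
        f.write(content)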


Code for step 2:

1. Read the txt file, visit each document URL in turn, and save each document locally

# -*- coding: utf-8 -*-
import re
import traceback

import requests

from utils.FileUtils import getFileContentList, writeAutoEncodeContentStr2File

# Impersonate a mobile browser: the desktop pages cannot be fetched in full
headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 "
                  "Mobile Safari/537.36"
}


def get_num(url):
    response = requests.get(url, headers=headers).text
    if "md5sum" not in response:
        return

    # Pull the request parameters out of the page source
    result = re.search(
        r'&md5sum=(.*)&sign=(.*)&rtcs_flag=(.*)&rtcs_ver=(.*?)".*rsign":"(.*?)",',
        response, re.M | re.I)

    reader = {
        "md5sum": result.group(1),
        "sign": result.group(2),
        "rtcs_flag": result.group(3),
        "rtcs_ver": result.group(4),
        "width": 176,
        "type": "org",
        "rsign": result.group(5)
    }

    # Collect the (merge, page) tag pair for every page of the document
    result_page = re.findall(r'merge":"(.*?)".*?"page":(.*?)}', response)
    # Strip the "https://wenku.baidu.com/view/" prefix (29 chars) and the
    # ".html" suffix, leaving the document id
    doc_url = "https://wkretype.bdimg.com/retype/merge/" + url[29:-5]
    n = 0
    for i in range(len(result_page)):  # fetch at most 10 pages per request
        if i % 10 == 0 and i > 0:  # i > 0 avoids an empty first batch
            doc_range = '_'.join([k for k, v in result_page[n:i]])
            reader['pn'] = n + 1
            reader['rn'] = 10
            reader['callback'] = 'sf_edu_wenku_retype_doc_jsonp_%s_10' % (
                reader.get('pn'))
            reader['range'] = doc_range
            n = i
            get_page(doc_url, reader)
    else:  # for-else: once the loop ends, fetch the remaining (<10) pages
        doc_range = '_'.join([k for k, v in result_page[n:i + 1]])
        reader['pn'] = n + 1
        reader['rn'] = i - n + 1
        reader['callback'] = 'sf_edu_wenku_retype_doc_jsonp_%s_%s' % (
            reader.get('pn'), reader.get('rn'))
        reader['range'] = doc_range
        get_page(doc_url, reader)


def get_page(url, data):
    response = requests.get(url, headers=headers, params=data).text
    # Turn the \uXXXX escapes back into readable text
    response = response.encode('utf-8').decode('unicode_escape')
    response = re.sub(r',"no_blank":true', '', response)  # clean the payload
    result = re.findall(r'c":"(.*?)"}', response)  # extract the text fragments
    result = '\n'.join(result)
    title = result.split('\n')[0]
    print title
    writeAutoEncodeContentStr2File(result, title[:30] + ".txt")


def downloadFile():
    for url in getFileContentList("wenku.doc.url.txt"):
        try:
            get_num(url)
        except Exception:
            traceback.print_exc()
            continue


if __name__ == '__main__':
    downloadFile()
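As with step 1, the utils.FileUtils helpers used here are not shown in the post. A minimal sketch, assuming getFileContentList returns the file's lines and writeAutoEncodeContentStr2File writes a string to disk as UTF-8; both behaviors are inferred from the call sites, not confirmed by the source:

# -*- coding: utf-8 -*-
# Hypothetical reconstruction of the step-2 helpers; the originals are not
# included in the post.

def getFileContentList(filename):
    # Return the file's lines with trailing newlines stripped
    with open(filename) as f:
        return [line.strip() for line in f]

def writeAutoEncodeContentStr2File(content, filename):
    # Write content to filename; "auto encode" is assumed to mean
    # encoding unicode text as UTF-8 before writing
    if isinstance(content, unicode):
        content = content.encode('utf-8')
    with open(filename, 'w') as f:
        f.write(content)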




—————END—————

