# 由于微信验证方式的原因,只能爬取文章,不能爬取评论和点赞数等详细信息
import copy
import csv
import json
import os
import re
import time
import traceback
import urllib
import urllib.parse

import requests
class NoQuoteSession(requests.Session):
    """A Session that un-escapes characters requests percent-encodes in URLs.

    requests' URL preparation quotes characters such as '=' and '{' that the
    WeChat endpoint expects to receive literally; this subclass reverts the
    encoding for a fixed character set just before the request is sent.
    """

    # Characters whose percent-encoded form must be restored to the literal
    # character in the prepared URL, in replacement order ('%' goes last so
    # earlier replacements are not re-expanded).
    _RAW_CHARS = '{}:,=%'

    def send(self, prep, **send_kwargs):
        for ch in self._RAW_CHARS:
            prep.url = prep.url.replace(urllib.parse.quote(ch), ch)
        return super().send(prep, **send_kwargs)
class weixin_spider():
def __init__(self):
self.msgc
ookie = {} #为提取不同行为用的不同c
ookie分配空间
self.msgparams = {}
self.msgheaders = {}
self.headers = {}
self.prefix = 'https://mp.weixin.qq.com/mp/' #网址前缀
self.suffix = '' #网址后缀
self.url = '' #完整网址
self.msgparams['offset'] = 0 #默认偏移量为0
self.msgparams['count'] = 10 #默认步进为10
self.json_data = {}
self.data_decoded = [] #完成一个完整的数据提取循环的数据
self.data_decoded_temp = [] #临时存储当前循环的数据,网址返回错误信息时丢弃,防止污染已提取数据
self.id = 0 #初始化程序内部ID
def information_init(self):
print("注意,下列输入均输入至c
ookie行(包含该行)结束,如果有多余数据请勿输入,防止数据污染")
print("请输入action = getmsg的header")
self.decode_headers('getmsg')
def decode_url(self,url,flag): #获取网址信息函数,flag请填写成'getmsg'
temp = url.split('?') #将网址数据区剥离
data = temp[1] #提取数据
temp = data.split('&') #将数据分块
for i in temp:
data = i.split('=',1) #提取变量名和值
if flag == 'getmsg':
self.msgparams[data[0]] = data[1] #向数据字典中添加数据
def decode_c
ookie(self,c
ookie,flag): #获取c
ookie函数(c
ookie需要ctrl+c复制自fiddler),c
ookie形如 C
ookie:
#rewardsn=;wxtokenkey=777;
temp = c
ookie.split(': ') #将c
ookie分块
c
ookie = temp[1] #去掉无用的C
ookie: 前缀
temp = c
ookie.split('; ')
for i in temp: #处理所有c
ookie数据
temp2 = i.split('=',1)
if flag == 'getmsg':
self.msgc
ookie[temp2[0]] = temp2[1]
def decode_headers(self,flag): #分析header的raw数据,flag是一个已经废弃的参数,请填写'getmsg'
temp = 'NULL'
while(temp != ''):
try:
temp = input()
if ('GET' in temp) or ('POST' in temp): #属于params的部分
temp = temp.replace('HTTP/1.1','')
self.decode_url(temp,flag)
elif 'C
ookie' in temp:
self.decode_c
ookie(temp,flag)
else: #属于header的部分
temp = temp.split(': ')
if flag == 'getmsg':
self.msgheaders[temp[0]] = temp[1]
except:
break
def decode_response_getmsg(self,response): #解析action = getmsg时获取的json
self.json_data = json.loads(response.text)
if self.json_data['ret'] != 0:
print("出现错误,程序正在退出,data = {:}".format(self.json_data))
raise Exception('返回结果有误,请检查header是否有效')
self.json_urls = json.loads(self.json_data['general_msg_list'])
def decode_list(self): #解析decode_response得到的json并提取相应数据存到data_decoded_temp
num = 0
for i in self.json_urls['list']:
num+=1
data = i['comm_msg_info']
self.data_decoded_temp = {}
self.data_decoded_temp['inside_id'] = self.id #程序内部的ID
self.id+=1
self.data_decoded_temp['id'] = data['id'] #文章在微信中的ID
self.data_decoded_temp['type'] = data['type']
self.data_decoded_temp['datetime'] = data['datetime']
self.data_decoded_temp['time'] = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(data['datetime']))
self.data_decoded_temp['fakeid'] = data['fakeid']
self.data_decoded_temp['status'] = data['status']
try: #有些单篇文章无该字段
data = i['app_msg_ext_info']
except:
self.data_decoded_temp['title'] = data['content']
self.data_decoded_temp['digest'] = ''
self.data_decoded_temp['fileid'] = ''
self.data_decoded_temp['content_url'] = ''
self.data_decoded_temp['source_url'] = ''
self.data_decoded_temp['cover'] = ''
self.data_decoded_temp['subtype'] = ''
self.data_decoded_temp['author'] = ''
print("{:^21} {:^12} {:^14} {:>20}
".format('当前系统时间','推送时间','作者','标题'))
print("{systime:^24} {datetime:^19} {author:<{len1}} {title:}".format(
systime = time.ctime(time.time()),datetime = self.data_decoded_temp['time'],
author = self.data_decoded_temp['author'],len1 = 16-len(self.data_decoded_temp['author']),
title = self.data_decoded_temp['title']))
co
ntinue
self.data_decoded_temp['title'] = data['title'].replace('&','&') #文章标题
self.data_decoded_temp['digest'] = data['digest']
self.data_decoded_temp['fileid'] = data['fileid']
self.data_decoded_temp['content_url'] = data['content_url'] #含有用户信息的临时链接
self.data_decoded_temp['source_url'] = data['source_url'] #文章永久链接
self.data_decoded_temp['cover'] = data['cover']
self.data_decoded_temp['subtype'] = data['subtype']
self.data_decoded_temp['author'] = data['author']
if self.data_decoded_temp['cover'] != '':
self.get_icon(self.data_decoded_temp['cover'],self.id - 1) #下载图片并命名为 文章标题.png
self.data_decoded.append(copy.deepcopy(self.data_decoded_temp))
print("{:^21} {:^12} {:^14} {:>20}