会员登录|免费注册|忘记密码|管理入口 返回主站||保存桌面
python大作业-基于python实现微信公众号文章爬虫源码+详细代码注释+项目说明.zip
2024-12-18IP属地 湖北1
# Due to WeChat's verification mechanism, this spider can only crawl articles; comments, like counts and other detail metrics cannot be fetched.

python大作业-基于python实现微信公众号文章爬虫源码+详细代码注释+项目说明.zip

import json
import requests
import re
import copy
import time
import csv
import os
import traceback
# FIX: was `import urllib` (and `import requests` twice); the code uses
# urllib.parse, which a bare `import urllib` does not guarantee to load.
import urllib.parse


class NoQuoteSession(requests.Session):
    """requests.Session that undoes automatic percent-encoding in the URL.

    requests encodes characters such as '=' and '{' in query strings, but the
    WeChat mp endpoint expects them literally, so each quoted form is mapped
    back to its raw character just before the request is sent.
    """

    def send(self, prep, **send_kwargs):
        # quoted form -> literal character, for every character requests
        # would otherwise escape in this URL.
        table = {
            urllib.parse.quote('{'): '{',
            urllib.parse.quote('}'): '}',
            urllib.parse.quote(':'): ':',
            urllib.parse.quote(','): ',',
            urllib.parse.quote('='): '=',
            urllib.parse.quote('%'): '%',
        }
        for old, new in table.items():
            prep.url = prep.url.replace(old, new)
        return super(NoQuoteSession, self).send(prep, **send_kwargs)


class weixin_spider():
    """Scraper for WeChat official-account article lists (action=getmsg).

    Request parameters, headers and cookies are pasted in by the user from a
    capture tool (e.g. Fiddler); `decode_headers` parses that raw paste.
    """

    def __init__(self):
        self.msgcookie = {}    # cookies for the getmsg action
        self.msgparams = {}    # query-string parameters for the getmsg action
        self.msgheaders = {}   # HTTP headers for the getmsg action
        self.headers = {}
        self.prefix = 'https://mp.weixin.qq.com/mp/'  # URL prefix
        self.suffix = ''       # URL suffix
        self.url = ''          # full URL
        self.msgparams['offset'] = 0   # default paging offset
        self.msgparams['count'] = 10   # default page size
        self.json_data = {}
        # Articles accumulated over all completed extraction cycles.
        self.data_decoded = []
        # Current cycle's record; discarded if the server returns an error,
        # so bad responses never pollute already-extracted data.
        # NOTE(review): initialised as a list here but rebound to a dict in
        # decode_list; the initial value is never used as a list.
        self.data_decoded_temp = []
        self.id = 0            # internal sequential article id

    def information_init(self):
        """Prompt the user to paste the captured getmsg request headers."""
        print("注意,下列输入均输入至cookie行(包含该行)结束,如果有多余数据请勿输入,防止数据污染")
        print("请输入action = getmsg的header")
        self.decode_headers('getmsg')

    def decode_url(self, url, flag):
        """Parse the query string of a captured URL into self.msgparams.

        flag should be 'getmsg'.
        """
        temp = url.split('?')
        data = temp[1]           # keep only the part after '?'
        temp = data.split('&')   # split into key=value chunks
        for i in temp:
            data = i.split('=', 1)   # split on first '=' only; values may contain '='
            if flag == 'getmsg':
                self.msgparams[data[0]] = data[1]

    def decode_cookie(self, cookie, flag):
        """Parse a raw 'Cookie: a=b; c=d' line into self.msgcookie.

        The line is expected as copied from Fiddler, e.g.
        'Cookie: rewardsn=;wxtokenkey=777;'.
        """
        temp = cookie.split(': ')
        cookie = temp[1]             # drop the 'Cookie: ' prefix
        temp = cookie.split('; ')
        for i in temp:
            temp2 = i.split('=', 1)  # split on first '=' only
            if flag == 'getmsg':
                self.msgcookie[temp2[0]] = temp2[1]

    def decode_headers(self, flag):
        """Read raw request text from stdin until a blank line.

        Request-line (GET/POST) goes to decode_url, the Cookie line to
        decode_cookie, everything else into self.msgheaders.  `flag` is a
        legacy parameter; pass 'getmsg'.
        """
        temp = 'NULL'
        while temp != '':
            # FIX: was a bare `except:`; catch Exception so Ctrl-C /
            # SystemExit still propagate.  Malformed lines or EOF simply
            # terminate input, as before.
            try:
                temp = input()
                if ('GET' in temp) or ('POST' in temp):
                    # Request line: strip the protocol token, keep the URL.
                    temp = temp.replace('HTTP/1.1', '')
                    self.decode_url(temp, flag)
                elif 'Cookie' in temp:
                    self.decode_cookie(temp, flag)
                else:
                    # Ordinary 'Name: value' header line.
                    temp = temp.split(': ')
                    if flag == 'getmsg':
                        self.msgheaders[temp[0]] = temp[1]
            except Exception:
                break

    def decode_response_getmsg(self, response):
        """Parse an action=getmsg response; raise if the server reports an error."""
        self.json_data = json.loads(response.text)
        if self.json_data['ret'] != 0:
            print("出现错误,程序正在退出,data = {:}".format(self.json_data))
            raise Exception('返回结果有误,请检查header是否有效')
        # The article list itself is a JSON string embedded in the response.
        self.json_urls = json.loads(self.json_data['general_msg_list'])

    def decode_list(self):
        """Extract article records from self.json_urls into data_decoded."""
        num = 0
        for i in self.json_urls['list']:
            num += 1
            data = i['comm_msg_info']
            self.data_decoded_temp = {}
            self.data_decoded_temp['inside_id'] = self.id    # program-internal id
            self.id += 1
            self.data_decoded_temp['id'] = data['id']        # WeChat's article id
            self.data_decoded_temp['type'] = data['type']
            self.data_decoded_temp['datetime'] = data['datetime']
            self.data_decoded_temp['time'] = time.strftime(
                "%Y-%m-%d %H:%M:%S", time.localtime(data['datetime']))
            self.data_decoded_temp['fakeid'] = data['fakeid']
            self.data_decoded_temp['status'] = data['status']
            # FIX: was a bare `except:`; only a missing key means "plain text
            # post without app_msg_ext_info".
            try:
                data = i['app_msg_ext_info']
            except KeyError:
                # Text-only post: `data` is still comm_msg_info here.
                self.data_decoded_temp['title'] = data['content']
                self.data_decoded_temp['digest'] = ''
                self.data_decoded_temp['fileid'] = ''
                self.data_decoded_temp['content_url'] = ''
                self.data_decoded_temp['source_url'] = ''
                self.data_decoded_temp['cover'] = ''
                self.data_decoded_temp['subtype'] = ''
                self.data_decoded_temp['author'] = ''
                print("{:^21} {:^12} {:^14} {:>20} ".format('当前系统时间', '推送时间', '作者', '标题'))
                print("{systime:^24} {datetime:^19} {author:<{len1}} {title:}".format(
                    systime=time.ctime(time.time()), datetime=self.data_decoded_temp['time'],
                    author=self.data_decoded_temp['author'], len1=16 - len(self.data_decoded_temp['author']),
                    title=self.data_decoded_temp['title']))
                # NOTE(review): text-only posts are printed but never appended
                # to data_decoded — presumably a data-loss bug; preserved as-is.
                continue
            # FIX: was .replace('&amp', '&'), which turned '&amp;' into '&;'.
            self.data_decoded_temp['title'] = data['title'].replace('&amp;', '&')
            self.data_decoded_temp['digest'] = data['digest']
            self.data_decoded_temp['fileid'] = data['fileid']
            self.data_decoded_temp['content_url'] = data['content_url']  # temporary link containing user info
            self.data_decoded_temp['source_url'] = data['source_url']    # permanent article link
            self.data_decoded_temp['cover'] = data['cover']
            self.data_decoded_temp['subtype'] = data['subtype']
            self.data_decoded_temp['author'] = data['author']
            if self.data_decoded_temp['cover'] != '':
                # get_icon is defined elsewhere in this file (outside this view);
                # downloads the cover image named after the article.
                self.get_icon(self.data_decoded_temp['cover'], self.id - 1)
            # Deep-copy so later mutation of data_decoded_temp cannot alter
            # the stored record.
            self.data_decoded.append(copy.deepcopy(self.data_decoded_temp))
            # NOTE(review): the source file is truncated here mid-statement;
            # this print is completed from the identical header format used
            # above — confirm against the original file.
            print("{:^21} {:^12} {:^14} {:>20} ".format('当前系统时间', '推送时间', '作者', '标题'))