Basic Information
Source name: Python Weibo crawler sample source code (lxml)
Source size: 7.81 KB
File format: .py
Development language: Python
Last updated: 2018-04-22
Friendly reminder: no registration or top-up is required; the download link is provided after sponsoring.
Source Introduction
Create the file D:/weibo/weibo_crawl.txt first, then run this example. The full source is listed below.
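As a convenience, here is a minimal setup sketch (assuming the default save path D:/weibo/ used by the script below) that creates the directory and an empty weibo_crawl.txt before the first run:

import os

save_dir = 'D:/weibo/'  # default save path used by the example below
os.makedirs(save_dir, exist_ok=True)  # create D:/weibo/ if it does not already exist
# Touch the output file so the crawler's append-mode writes have a target
open(os.path.join(save_dir, 'weibo_crawl.txt'), 'a', encoding='utf-8').close()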
# -*- coding:utf-8 -*-
'''
Created on 2018-03-09
@author: ora_jason
'''
from lxml import html
import requests
import json
import re
import os
import time
import urllib.request


class CrawlWeibo:
    # Get the list of all weibo "cards" of a given blogger
    def getCards(self, id, page):  # id (str): the blogger's user id; page (int): number of pages to crawl
        ii = 0
        list_cards = []
        while ii < page:
            ii = ii + 1
            print('Crawling page %d of cards' % ii)
            url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=' + id + '&containerid=107603' + id + '&page=' + str(ii)
            print(url)
            response = requests.get(url, headers=headers)
            ob_json = json.loads(response.text)  # ob_json is a dict
            list_cards.append(ob_json['data']['cards'])  # ob_json['data']['cards'] is a list
            time.sleep(2)
            print('Pausing for 2 seconds')  # pause for two seconds after each page of cards
        return list_cards  # return the cards of all pages

    # Get the list of hot comments (or plain comments) of a single weibo
    def getComments(self, id, page):  # id (str): the weibo id; page (int): comment page number
        url = 'https://m.weibo.cn/api/comments/show?id=' + id + '&page=' + str(page)
        response = requests.get(url, headers=headers)
        ob_json = json.loads(response.text)
        list_comments = []
        if 'data' in ob_json:
            if 'hot_data' in ob_json['data']:
                list_comments = ob_json['data']['hot_data']
            else:
                list_comments = ob_json['data']['data']
        return list_comments  # return the comments under this weibo

    def getAll(self, id, page, path):  # id: blogger uid, page: pages to crawl, path: local save path
        list_cards = self.getCards(id, page)
        print('Pages crawled: ' + str(len(list_cards)) + '\n' + 30 * '-')
        count_weibo = 1
        page_weibo = 1
        # Walk through all weibo on each page, save the text, and look up hot comments by id
        for cards in list_cards:
            for card in cards:
                if card['card_type'] == 9:  # keep only weibo cards
                    # if card['card_type'] == 9 and 'raw_text' not in card['mblog']:  # keep only original weibo
                    print('Crawling page ' + str(page_weibo) + ', card ' + str(count_weibo))
                    mid = card['mblog']['id']
                    created_at = card['mblog']['created_at']

                    # Fetch and save the text
                    if not card['mblog'].get('isLongText', False):  # isLongText is a JSON boolean, not the string 'false'
                        text = card['mblog']['text']
                    else:
                        url = 'https://m.weibo.cn/statuses/extend?id=' + mid
                        response = requests.get(url, headers=headers)
                        ob_json = json.loads(response.text)  # ob_json is a dict
                        text = ob_json['data']['longTextContent']
                    tree = html.fromstring(text)
                    text = tree.xpath('string(.)')  # use string() to strip extra tags

                    # Write out the weibo text
                    with open(path + 'weibo_crawl.txt', 'a', encoding='utf-8') as ff:
                        ff.write('No.' + str(count_weibo) + '\n' + '*** posted at ' + created_at + ' ***' + '\n')
                        ff.write(text + '\n')

                    # Fetch and save images
                    if 'bmiddle_pic' in card['mblog']:
                        image_path = path + str(count_weibo)
                        if os.path.exists(image_path) is False:
                            os.mkdir(image_path)
                        url_extend = 'https://m.weibo.cn/status/' + mid  # URL of the single weibo
                        res = requests.get(url_extend, headers=headers).text  # str
                        imgurl_weibo = re.findall('https://.*large.*.jpg', res)  # match image URLs with a regex
                        x = 1
                        print(imgurl_weibo)
                        for i in range(len(imgurl_weibo)):
                            temp = image_path + '/' + str(x) + '.jpg'
                            # Append the image URL to the weibo text file
                            with open(path + 'weibo_crawl.txt', 'a', encoding='utf-8') as ff:
                                ff.write('Weibo image link: ' + imgurl_weibo[i] + '\n')
                            print('Downloading image %s of this weibo' % x)
                            try:
                                urllib.request.urlretrieve(urllib.request.urlopen(imgurl_weibo[i]).geturl(), temp)
                            except Exception:
                                print('Failed to download this image: %s' % imgurl_weibo)
                            x = x + 1
                        with open(path + 'weibo_crawl.txt', 'a', encoding='utf-8') as ff:
                            ff.write(78 * '-' + 'Comments' + '>' + 78 * '-' + '\n')
                    else:
                        with open(path + 'weibo_crawl.txt', 'a', encoding='utf-8') as ff:
                            ff.write(78 * '-' + 'Comments' + '>' + 78 * '-' + '\n')
                    count_weibo = count_weibo + 1

                    # Fetch the hot comments by weibo id and write them out
                    list_comments = self.getComments(mid, 1)  # only the first page of comments is needed
                    print('Crawling comments of this weibo')
                    count_hotcomments = 1
                    for comment in list_comments:
                        # like_counts = comment['like_counts']  # number of likes
                        text = comment['text']  # comment body
                        tree = html.fromstring(text)
                        text = tree.xpath('string(.)')  # use string() to strip extra tags
                        name_user = comment['user']['screen_name']  # commenter's screen name
                        # Write out the comment
                        if count_hotcomments < len(list_comments):
                            with open(path + 'weibo_crawl.txt', 'a', encoding='utf-8') as ff:
                                result = str(count_hotcomments) + ': #' + name_user + '#'
                                ff.write(result + '\n')
                                ff.write(text + '\n\n')
                        else:
                            with open(path + 'weibo_crawl.txt', 'a', encoding='utf-8') as ff:
                                result = str(count_hotcomments) + ': #' + name_user + '#'
                                ff.write(result + '\n')
                                ff.write(text + '\n')
                        count_hotcomments = count_hotcomments + 1

                    with open(path + 'weibo_crawl.txt', 'a', encoding='utf-8') as ff:
                        ff.write(78 * '-' + '<' + 'Comments' + 78 * '-' + '\n\n\n\n')
                    # time.sleep(2)
                    print('Pausing for 2 seconds\n')  # pause for two seconds after finishing one weibo
            page_weibo = page_weibo + 1


# Request headers; update Cookie and Referer when crawling a different blogger
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    # 'Cookie': '_T_WM=5a5b9ae925e458f93279d6708b159927; ALF=1523107054; SCF=Ativ2ybI8StjZccoSRca_uyzfWFIcM45JHEaLQ_tD8ksmi6-whOM5Pl1p8Vz4EziyMQe5QgrSlo8RY9Nd3NiFO8.; SUB=_2A253pUizDeRhGeRG61EV9S_NwzuIHXVVZmj7rDV6PUJbktANLXD4kW1NTeA_GStZpY6CFmR1PzgN50YL186u9HbC; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9Wh2V9E8BT6Glu5-SxvF-MwO5JpX5K-hUgL.FozReheXSK2p1hM2dJLoI7D29PyXUGxXUsHE; SUHB=02M_R-ArnK_FEZ; WEIBOCN_FROM=1110006030; M_WEIBOCN_PARAMS=featurecode%3D20000320%26oid%3D3900009063730800%26luicode%3D10000011%26lfid%3D1076031195054531%26fid%3D1005051195054531%26uicode%3D10000011',
    'Cookie': '_T_WM=8d29214da8ba1494873830fceb25abf1; WEIBOCN_FROM=1110006030; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D1076031195054531',
    'Host': 'm.weibo.cn',
    # 'qq': 'MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1',
    'Referer': 'https://m.weibo.cn/u/1195054531',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
}

crawl_weibo = CrawlWeibo()  # instantiate the crawler class and call its methods
crawl_weibo.getAll('1195054531', 3, 'D:/weibo/')  # uid of the user to crawl, number of pages, local save path
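The getComments method can also be called on its own, for example to spot-check the comments of a single post before running a full crawl. A small sketch, assuming the Cookie and Referer in headers are still valid; the weibo id below is a placeholder, not one taken from this example:

crawler = CrawlWeibo()
# '4212345678901234' is a hypothetical status id; replace it with a real one
for comment in crawler.getComments('4212345678901234', 1):
    print(comment['user']['screen_name'] + ': ' + comment['text'])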