Basic Information
Source name: Python Weibo crawler sample source code (lxml)
Source size: 7.81 KB
File format: .py
Language: Python
Last updated: 2018-04-22
Source Introduction
Create the file D:/weibo/weibo_crawl.txt first, then run this example.
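If D:/weibo/ or weibo_crawl.txt does not exist yet, a minimal preparation sketch (an illustrative assumption, reusing the same path and file name that the script below appends to) could be:

import os

save_dir = 'D:/weibo/'                     # same folder the crawler writes into
os.makedirs(save_dir, exist_ok=True)       # create D:/weibo/ if it is missing
# create an empty weibo_crawl.txt so the crawler can append to it later
open(os.path.join(save_dir, 'weibo_crawl.txt'), 'a', encoding='utf-8').close()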
# -*- coding:utf-8 -*-
'''
Created on 2018-03-09
@author: ora_jason
'''
from lxml import html
import requests
import json
import re
import os
import time
import urllib.request
class CrawlWeibo:
    # Get the list of weibo "cards" for every requested page of a given blogger
    def getCards(self, id, page):  # id (str): the blogger's user id; page (int): number of pages to crawl
        ii = 0
        list_cards = []
        while ii < page:
            ii = ii + 1
            print('Crawling cards on page %d' % ii)
            url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=' + id + '&containerid=107603' + id + '&page=' + str(ii)
            print(url)
            response = requests.get(url, headers=headers)
            ob_json = json.loads(response.text)  # ob_json is a dict
            list_cards.append(ob_json['data']['cards'])  # ob_json['data']['cards'] is a list
            time.sleep(2)
            print('Pausing for 2 seconds')  # pause for two seconds after each page of cards
        return list_cards  # return the cards of all pages
    # Get the list of hot comments (or plain comments) of a single weibo post
    def getComments(self, id, page):  # id (str): the id of a weibo post; page (int): comment page number
        url = 'https://m.weibo.cn/api/comments/show?id=' + id + '&page=' + str(page)
        response = requests.get(url, headers=headers)
        ob_json = json.loads(response.text)
        list_comments = []
        if 'data' in ob_json:
            if 'hot_data' in ob_json['data']:
                list_comments = ob_json['data']['hot_data']
            else:
                list_comments = ob_json['data']['data']
        return list_comments  # return the comments under the post
    def getAll(self, id, page, path):  # id: blogger uid; page: number of pages to crawl; path: local save path
        list_cards = self.getCards(id, page)
        print('Pages crawled: ' + str(len(list_cards)) + '\n' + 30 * '-')
        count_weibo = 1
        page_weibo = 1
        # Walk through every post on each page, save its content, and look up hot comments by post id
        for cards in list_cards:
            for card in cards:
                if card['card_type'] == 9:  # keep only weibo posts
                    # if card['card_type'] == 9 and 'raw_text' not in card['mblog']:  # keep only original posts
                    print('Crawling page ' + str(page_weibo) + ', card ' + str(count_weibo))
                    mid = card['mblog']['id']
                    created_at = card['mblog']['created_at']
                    # Fetch the post text
                    if not card['mblog'].get('isLongText', False):  # isLongText is a boolean in the JSON response
                        text = card['mblog']['text']
                    else:
                        url = 'https://m.weibo.cn/statuses/extend?id=' + mid
                        response = requests.get(url, headers=headers)
                        ob_json = json.loads(response.text)  # ob_json is a dict
                        text = ob_json['data']['longTextContent']
                    tree = html.fromstring(text)
                    text = tree.xpath('string(.)')  # use string() to strip the remaining HTML tags
                    # Write the post text
                    with open(path + 'weibo_crawl.txt', 'a', encoding='utf-8') as ff:
                        ff.write('Post ' + str(count_weibo) + '\n' + '*** posted at ' + created_at + ' ***' + '\n')
                        ff.write(text + '\n')
                    # Fetch and save images
                    if 'bmiddle_pic' in card['mblog']:
                        image_path = path + str(count_weibo)
                        if os.path.exists(image_path) is False:
                            os.mkdir(image_path)
                        url_extend = 'https://m.weibo.cn/status/' + mid  # url of the single post
                        res = requests.get(url_extend, headers=headers).text  # str
                        imgurl_weibo = re.findall('https://.*large.*.jpg', res)  # match the image urls with a regex
                        x = 1
                        print(imgurl_weibo)
                        for i in range(len(imgurl_weibo)):
                            temp = image_path + '/' + str(x) + '.jpg'
                            # append the image url to the text file
                            with open(path + 'weibo_crawl.txt', 'a', encoding='utf-8') as ff:
                                ff.write('image url: ' + imgurl_weibo[i] + '\n')
                            print('Downloading image %s of this post' % x)
                            try:
                                urllib.request.urlretrieve(urllib.request.urlopen(imgurl_weibo[i]).geturl(), temp)
                            except Exception:
                                print('Failed to download image: %s' % imgurl_weibo[i])
                            x = x + 1
                        with open(path + 'weibo_crawl.txt', 'a', encoding='utf-8') as ff:
                            ff.write(78 * '-' + 'comments' + '>' + 78 * '-' + '\n')
                    else:
                        with open(path + 'weibo_crawl.txt', 'a', encoding='utf-8') as ff:
                            ff.write(78 * '-' + 'comments' + '>' + 78 * '-' + '\n')
                    count_weibo = count_weibo + 1
                    # Fetch the hot comments for this post by its id and write them out
                    list_comments = self.getComments(mid, 1)  # only the first page of comments is needed
                    print('Crawling the comments of this post')
                    count_hotcomments = 1
                    for comment in list_comments:
                        # like_counts = comment['like_counts']  # number of likes
                        text = comment['text']  # comment content
                        tree = html.fromstring(text)
                        text = tree.xpath('string(.)')  # use string() to strip the remaining HTML tags
                        name_user = comment['user']['screen_name']  # commenter's screen name
                        # Write the comment
                        if count_hotcomments < len(list_comments):
                            with open(path + 'weibo_crawl.txt', 'a', encoding='utf-8') as ff:
                                result = str(count_hotcomments) + ': #' + name_user + '#'
                                ff.write(result + '\n')
                                ff.write(text + '\n\n')
                        else:
                            with open(path + 'weibo_crawl.txt', 'a', encoding='utf-8') as ff:
                                result = str(count_hotcomments) + ': #' + name_user + '#'
                                ff.write(result + '\n')
                                ff.write(text + '\n')
                        count_hotcomments = count_hotcomments + 1
                    with open(path + 'weibo_crawl.txt', 'a', encoding='utf-8') as ff:
                        ff.write(78 * '-' + '<' + 'comments' + 78 * '-' + '\n\n\n\n')
                    # time.sleep(2)
                    print('Pausing for 2 seconds\n')  # pause two seconds after finishing one post
            page_weibo = page_weibo + 1


# Request headers; update the Cookie and Referer when crawling a new blogger
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Cookie': '_T_WM=8d29214da8ba1494873830fceb25abf1; WEIBOCN_FROM=1110006030; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D1076031195054531',
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/u/1195054531',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
}

crawl_weibo = CrawlWeibo()  # instantiate the crawler class and run it
crawl_weibo.getAll('1195054531', 3, 'D:/weibo/')  # target user's uid, number of pages to crawl, local save path
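
As the header comment above notes, crawling a different blogger needs a fresh Cookie and a matching Referer. A hedged usage sketch (the uid and cookie string below are placeholders, not real values) might look like:

target_uid = '1234567890'  # placeholder uid of another blogger (hypothetical)
headers['Referer'] = 'https://m.weibo.cn/u/' + target_uid
headers['Cookie'] = 'PASTE_A_FRESH_COOKIE_FROM_m.weibo.cn_HERE'  # placeholder cookie string
crawl_weibo.getAll(target_uid, 3, 'D:/weibo/')  # same local save path as above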