基本信息
源码名称:Scrapy框架简单应用:爬取免费的西刺代理IP
源码大小:0.01M
文件格式:.zip
开发语言:Python
更新时间:2018-06-29
友情提示:(无需注册或充值,赞助后即可获取资源下载链接)
嘿,亲!知识可是无价之宝呢,但咱这精心整理的资料也耗费了不少心血呀。小小地破费一下,绝对物超所值哦!如有下载和支付问题,请联系我们QQ(微信同号):813200300
本次赞助数额为: 2 元×
微信扫码支付:2 元
×
请留下您的邮箱,我们将在2小时内将文件发到您的邮箱
源码介绍
pipelines.py
利用Scrapy框架爬取免费的西刺代理IP
内含模拟浏览器,利用代理池反反爬虫
pipelines.py
from urllib.request import ProxyHandler, build_opener
import re
from urllib import request
# Optional MongoDB persistence (disabled). To enable, import pymongo's
# MongoClient and uncomment:
#
#   client = MongoClient('localhost', 27017)
#   db_auth = client.admin
#   db_auth.authenticate("root", "123")
#   db = client['代理']
#   collection = db['IP代理']

# Save verified proxies to a local text file instead.
# NOTE: opened for the lifetime of the process; Scrapy closes pipelines
# via close_spider, which would be the proper place to close this handle.
f = open('可用代理.txt', 'w', encoding='utf-8')
class ProxyPipeline(object):
    """Validate each scraped proxy by routing a test request through it;
    working proxies are appended to the module-level file ``f``.

    The item is expected to carry parallel lists under the keys
    ``IP`` and ``port`` (one entry per scraped proxy row).
    """

    def process_item(self, item, spider):
        """Check every proxy in *item* and record the usable ones.

        Returns the item unchanged so further pipelines can run
        (required by the Scrapy pipeline contract).
        """
        # Iterate the parallel IP/port lists pairwise.
        for ip, port in zip(item["IP"], item["port"]):
            proxy = ip + ':' + port
            # Route the probe request through this candidate proxy.
            proxy_handler = ProxyHandler({'http': 'http://' + proxy})
            opener = build_opener(proxy_handler)
            try:
                head = {
                    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                                  'Chrome/48.0.2564.116 Safari/537.36',
                }
                url = 'http://www.xdaili.cn/monitor'
                req = request.Request(url, headers=head)
                response = opener.open(req)
                data = response.read().decode('utf-8')
                if data:
                    # MongoDB persistence (disabled):
                    #   ret = collection.find_one({'IP': ip, '端口': port})
                    #   if not ret:
                    #       collection.save({'IP': ip, '端口': port, ...})
                    f.write(proxy)
                    f.write("\n")
                    print(proxy)
            except Exception as e:
                # Dead/blocked proxies are expected; skip them, but log
                # instead of swallowing silently so failures are visible.
                spider.logger.debug("proxy %s unusable: %s", proxy, e)
        return item
getip.py
import scrapy
from proxy.items import ProxyItem
import re


class GetipSpider(scrapy.Spider):
    """Scrape free proxies (IP, port, anonymity, type, location, last
    verification time) from xicidaili.com listing pages."""

    name = 'getip'
    allowed_domains = ['xicidaili.com']
    start_urls = [
        'http://www.xicidaili.com/wn/',
        'http://www.xicidaili.com/wt/',
    ]

    def parse(self, response):
        """Extract one ProxyItem of parallel column lists per page, then
        follow pages 2-3 of the same listing type."""
        item = ProxyItem()
        # Each field is a parallel list taken from the odd table rows.
        item["IP"] = response.xpath("//tr[@class = 'odd']/td[2]/text()").extract()
        item["port"] = response.xpath("//tr[@class = 'odd']/td[3]/text()").extract()
        item["nmd"] = response.xpath("//tr[@class = 'odd']/td[5]/text()").extract()
        item["type"] = response.xpath("//tr[@class = 'odd']/td[6]/text()").extract()
        item["addr"] = response.xpath("//tr[@class = 'odd']/td[4]/a/text()").extract()
        item["lastime"] = response.xpath("//tr[@class = 'odd']/td[10]/text()").extract()
        yield item

        # The last pagination link's href encodes the listing type
        # (e.g. "/wn/2"); pull the type segment out with a regex.
        # (renamed from "type" to avoid shadowing the builtin)
        page_href = response.xpath("//div[@class='pagination']/a[last()]/@href").extract()[0]
        listing_type = re.findall(r'/(.*?)/', page_href, re.S)
        # Only the first few pages carry recently-verified proxies.
        for page_no in range(2, 4):
            # Format the URL first, then resolve it (the original applied
            # "%" after urljoin, which worked only because the URL is
            # already absolute).
            next_page = response.urljoin(
                'http://www.xicidaili.com/%s/%s/' % (listing_type[0], page_no))
            print(next_page)
            yield scrapy.Request(next_page, callback=self.parse)