基本信息
源码名称:python 天气网爬虫(爬取天气预报)
源码大小:0.01M
文件格式:.py
开发语言:Python
更新时间:2019-06-22
友情提示:(无需注册或充值,赞助后即可获取资源下载链接)
嘿,亲!知识可是无价之宝呢,但咱这精心整理的资料也耗费了不少心血呀。小小地破费一下,绝对物超所值哦!如有下载和支付问题,请联系我们QQ(微信同号):813200300
本次赞助数额为: 2 元×
微信扫码支付:2 元
×
请留下您的邮箱,我们将在2小时内将文件发到您的邮箱
源码介绍
爬取天气网数据,显示登录界面和爬虫界面,并对采集到的数据进行统计
# -*- coding:utf-8 -*-
"""Weather crawler for www.weather.com.cn with a Tk login/crawl GUI and charts.

Flow when run as a script: a login window is shown; on a successful login the
spider crawls the region pages into ``tian.json``, the records are stored in
MySQL, an interactive crawl window is shown, and finally three matplotlib
charts summarise the crawled data.
"""
import csv
import json
import queue
import time
import tkinter as tk
from threading import Thread
from tkinter import *
from tkinter import messagebox

import matplotlib.pyplot as plt
import mysql.connector
import numpy as np
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq

start = ''

# Seconds to wait for the weather site before giving up on one request.
REQUEST_TIMEOUT = 10

# Widgets shared between the crawl window and the crawl() button callback.
url_input = None
text = None


def _parse_weather_row(tr):
    """Extract one forecast record from a table row.

    Returns a dict with the keys ``city``, ``tian``, ``wind``, ``max`` and
    ``min``, or ``None`` when the row has too few cells to index from the end.
    """
    cells = tr.find_all(name='td')
    if len(cells) < 5:
        return None
    # NOTE(review): the negative indices assume the site's column layout
    # (..., max, weather, wind, min, <last column>) -- confirm against the page.
    low = cells[-2].string
    return dict(
        city=cells[0].get_text().strip(),
        tian=cells[-4].get_text(),
        wind=' '.join(cells[-3].get_text().split()),
        max=cells[-5].get_text(),
        # Store a plain str (or None) rather than a bs4 NavigableString.
        min=str(low) if low is not None else None,
    )


def _to_int(value):
    """Return ``int(value)``, or ``None`` when the value is missing or not numeric."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return None


class TianQiSpider(object):
    """Multi-threaded spider: fetch region pages, parse rows, save JSON lines."""

    def __init__(self):
        self.start_url = 'http://www.weather.com.cn/textFC/hunan.shtml'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 8.0.0; Nexus 6P Build/OPP3.170518.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Mobile Safari/537.36'
        }
        # Queues used to pass work between the pipeline threads.
        self.url_queue = queue.Queue()
        self.html_queue = queue.Queue()
        self.data_queue = queue.Queue()

    def get_url(self):
        """Fetch the start page and enqueue every region URL found in its header."""
        print('数据采集中')
        html = requests.get(url=self.start_url, headers=self.headers,
                            timeout=REQUEST_TIMEOUT).content.decode()
        soup = BeautifulSoup(html, 'html5lib')
        div = soup.find(name='div', class_='lqcontentBoxheader')
        if div is None:
            # Page layout not as expected: leave the queue empty.
            return
        for a in div.find_all(name='a', href=True):
            self.url_queue.put('http://www.weather.com.cn' + a['href'])

    def send_requests(self):
        """Download queued URLs and push the HTML onto ``html_queue``.

        Returns when the URL queue is empty. ``get_nowait`` is used so that a
        worker which loses the race for the last URL exits instead of blocking.
        """
        while True:
            try:
                url = self.url_queue.get_nowait()
            except queue.Empty:
                return
            try:
                response = requests.get(url=url, headers=self.headers,
                                        timeout=REQUEST_TIMEOUT)
                self.html_queue.put(response.content.decode())
            except (requests.RequestException, UnicodeDecodeError) as exc:
                # One bad page must not kill the worker thread.
                print('请求失败,已跳过: %s (%s)' % (url, exc))
            finally:
                self.url_queue.task_done()

    @staticmethod
    def _parse_page(html):
        """Return the forecast records found in one region page."""
        soup = BeautifulSoup(html, 'html5lib')
        container = soup.find(name='div', class_="conMidtab")
        if container is None:
            return []
        # The first two rows are skipped, as in the original code.
        rows = container.find_all(name='tr')[2:]
        return [rec for rec in map(_parse_weather_row, rows) if rec is not None]

    def get_data(self):
        """Parse pages from ``html_queue`` into records on ``data_queue``.

        Runs until it receives the ``None`` sentinel that ``run()`` posts once
        all download workers have finished.
        """
        while True:
            html = self.html_queue.get()
            try:
                if html is None:
                    return
                for record in self._parse_page(html):
                    self.data_queue.put(record)
            finally:
                self.html_queue.task_done()

    def save_data(self):
        """Write records from ``data_queue`` to ``tian.json``, one JSON per line.

        Runs until it receives the ``None`` sentinel that ``run()`` posts once
        the parser has finished, then writes the record count to ``log.txt``.
        """
        data_num = 0
        with open('tian.json', 'w', encoding='utf8') as f:
            while True:
                data = self.data_queue.get()
                try:
                    if data is None:
                        break
                    f.write(json.dumps(data, ensure_ascii=False) + '\n')
                    data_num += 1
                finally:
                    self.data_queue.task_done()
        with open('log.txt', 'w', encoding='utf8') as f:
            f.write("爬取了%d条数据\n" % (data_num))
        print('数据采集完成')
        print("爬取了%d条数据" % (data_num))

    def run(self):
        """Run the whole pipeline and append timing information to ``log.txt``."""
        begin = time.time()
        self.get_url()
        fetchers = [Thread(target=self.send_requests) for _ in range(5)]
        parser = Thread(target=self.get_data)
        saver = Thread(target=self.save_data)
        thead_list = fetchers + [parser, saver]
        for t in thead_list:
            t.start()
        # Shut the stages down in order: each sentinel is posted only after
        # every producer of that queue has finished, so no item can be lost.
        for t in fetchers:
            t.join()
        self.html_queue.put(None)
        parser.join()
        self.data_queue.put(None)
        saver.join()
        elapsed = time.time() - begin
        with open('log.txt', 'a', encoding='utf8') as f:
            f.write("本次数据采集用时%d秒\n" % (elapsed))
            f.write("采用了%d个线程数\n" % (len(thead_list)))
        print("本次数据采集用时%d秒" % (elapsed))
        print("采用了%d个线程数" % (len(thead_list)))


def crawl():
    """Button callback: crawl the URL typed into ``url_input``.

    Each forecast row is appended to the ``text`` listbox and written to
    ``天气.csv``. Request failures are reported in the listbox instead of
    raising inside the Tk callback.
    """
    url = url_input.get()
    headers = {'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; Nexus 6P Build/OPP3.170518.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Mobile Safari/537.36'}
    print('在解析网址中:', url)
    try:
        html = requests.get(url=url, headers=headers,
                            timeout=REQUEST_TIMEOUT).content.decode()
    except (requests.RequestException, UnicodeDecodeError) as exc:
        text.insert(END, '请求失败: %s' % exc)
        text.see(END)
        return
    soup = BeautifulSoup(html, 'html5lib')
    container = soup.find(name='div', class_="conMidtab")
    if container is None:
        text.insert(END, '页面中未找到天气数据')
        text.see(END)
        return
    tr_list = container.find_all(name='tr')
    provinces = container.find_all(name='td', class_="rowsPan")
    province_num = 0
    # utf-8-sig so spreadsheet programs detect the encoding of the Chinese text.
    with open('天气.csv', 'w', newline='', encoding='utf-8-sig') as csv_file:
        writer = csv.writer(csv_file)
        for tr in tr_list:
            first_cell = tr.find(name='td')
            if first_cell is None:
                continue
            city = first_cell.get_text().strip()
            # Header rows are recognised by their first cell, before any other
            # cell is indexed (they may have fewer columns than data rows).
            if city == '省/直辖市':
                if province_num < len(provinces):
                    link = provinces[province_num].find(name='a')
                    if link is not None:
                        text.insert(END, link.get_text().strip())
                province_num += 1
                continue
            if city == '天气现象':
                continue
            row = _parse_weather_row(tr)
            if row is None:
                continue
            low = row['min'] if row['min'] is not None else '-'
            text.insert(END, "城市:" + row['city'] + " 天气现象:" + row['tian']
                        + " 风力风向:" + row['wind'] + " 最高温度" + row['max']
                        + "℃ 最低温度" + low + "℃")
            writer.writerow([row['city'], row['tian'], row['wind'], row['max'], low])
            # Keep the newest entry visible and refresh the widget.
            text.see(END)
            text.update()
    print('已抓取完毕')


def _load_records(path='tian.json'):
    """Read the JSON-lines file written by ``TianQiSpider.save_data``."""
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def _store_records(records):
    """Store the records in the MySQL table ``weather`` (created if missing).

    Values are passed as query parameters rather than formatted into the SQL
    text. ``replace into`` keeps a re-run from failing on the primary key.
    """
    # NOTE(review): connection settings (root user, empty password) are
    # hard-coded, as in the original code.
    conn = mysql.connector.connect(user='root', password='', host='127.0.0.1',
                                   database='testPython')
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(
                'create table if not exists weather (id varchar(20) primary key, '
                'city varchar(40), tian varchar(20), wind varchar(40), '
                '`max` varchar(20), `min` varchar(20))')
            rows = [
                (str(index), rec['city'], rec['tian'], rec['wind'], rec['max'], rec['min'])
                for index, rec in enumerate(records, start=1)
            ]
            if rows:
                cursor.executemany(
                    'replace into weather (id, city, tian, wind, `max`, `min`) '
                    'values (%s, %s, %s, %s, %s, %s)', rows)
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()


def _summarize(records):
    """Compute the figures plotted by ``_show_charts``.

    Returns a dict with: ``buckets`` (counts of minimum temperature <=10,
    10-20, 20-30 and >30), ``sky`` (counts of cloudy/overcast, sunny, other),
    and ``cities``/``lows``/``highs`` for the first ten records.
    """
    buckets = [0, 0, 0, 0]
    for rec in records:
        low = _to_int(rec['min'])
        if low is None:
            # A missing reading belongs to no temperature range.
            continue
        if low <= 10:
            buckets[0] += 1
        elif low <= 20:
            buckets[1] += 1
        elif low <= 30:
            buckets[2] += 1
        else:
            buckets[3] += 1

    sky = np.zeros(3)
    for rec in records:
        if rec['tian'] in ('多云', '阴'):
            sky[0] += 1
        elif rec['tian'] == '晴':
            sky[1] += 1
        else:
            sky[2] += 1

    head = records[0:10]
    return {
        'buckets': buckets,
        'sky': sky,
        'cities': [rec['city'] for rec in head],
        # Missing readings ('-' or None) are plotted as 0.
        'lows': [_to_int(rec['min']) or 0 for rec in head],
        'highs': [_to_int(rec['max']) or 0 for rec in head],
    }


def _open_crawl_window():
    """Show the interactive crawl window and block until it is closed."""
    global url_input, text
    root = Tk()
    root.title('测试——天气')
    # Size, then the window's x/y offset on the screen.
    root.geometry('550x400+398+279')
    Label(root, text='请输入测试的url:', font=("华文行楷", 20), fg='black').grid()
    url_input = Entry(root, font=("微软雅黑", 15))
    url_input.grid(row=0, column=1)
    Label(root, text='', font=("微软雅黑", 10), fg='black').grid(row=1)
    text = Listbox(root, font=('微软雅黑', 15), width=45, height=10)
    text.grid(row=2, columnspan=2)
    Button(root, text='开始测试', font=("微软雅黑", 15), command=crawl).grid(row=3, column=0, sticky=W)
    Button(root, text='退出', font=("微软雅黑", 15), command=root.quit).grid(row=3, column=1, sticky=E)
    root.mainloop()
    try:
        # quit() only stops the loop; remove the window before the charts open.
        root.destroy()
    except tk.TclError:
        # The window was already destroyed via its close button.
        pass


def _draw_pie(title, labels, sizes, colors, legend=False):
    """Draw one pie chart in a new figure; skipped when there is no data."""
    plt.figure()
    plt.title(title)
    if sum(sizes) > 0:
        plt.pie(sizes, explode=(0,) * len(sizes), labels=labels, colors=colors,
                autopct='%1.1f%%', shadow=True, startangle=50)
        if legend:
            plt.legend()
    plt.axis('equal')
    plt.grid(True)


def _show_charts(summary):
    """Plot the temperature bar chart and the two pie charts, then show them."""
    plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels
    plt.rcParams['axes.unicode_minus'] = False  # render the minus sign

    plt.figure()
    plt.title('全国城市气温统计')
    cities = summary['cities']
    if cities:
        lows = np.array(summary['lows'])
        highs = np.array(summary['highs'])
        positions = range(len(cities))
        plt.bar(positions, lows, label='最低气温', tick_label=cities)
        plt.bar(positions, highs, bottom=lows, label='最高气温', tick_label=cities)
        plt.legend()
    plt.grid(True)

    _draw_pie('最低气温范围统计',
              ('10℃以下', '10℃~20℃', '20℃~30℃', '30℃以上'),
              tuple(summary['buckets']),
              ('lightgreen', 'gold', 'lightskyblue', 'lightcoral'))
    _draw_pie('全国天气统计',
              ('阴', '晴', '有雨'),
              tuple(summary['sky']),
              ('lightgreen', 'gold', 'lightskyblue'),
              legend=True)
    plt.show()


class Login(object):
    """Login window that starts the crawl, storage and reporting on success."""

    def __init__(self):
        # Main window holding all other widgets.
        self.root = tk.Tk()
        self.root.title("测试天气网")
        self.root.geometry('450x300')
        self.label_account = tk.Label(self.root, text='Account: ')
        self.label_password = tk.Label(self.root, text='Password: ')
        self.input_account = tk.Entry(self.root, width=30)
        # The password entry masks its content with '*'.
        self.input_password = tk.Entry(self.root, show='*', width=30)
        self.login_button = tk.Button(self.root, command=self.backstage_interface, text="Login", width=10)
        self.siginUp_button = tk.Button(self.root, command=self.siginUp_interface, text="Sign up", width=10)

    def gui_arrang(self):
        """Place the widgets inside the login window."""
        self.label_account.place(x=60, y=170)
        self.label_password.place(x=60, y=195)
        self.input_account.place(x=135, y=170)
        self.input_password.place(x=135, y=195)
        self.login_button.place(x=140, y=235)
        self.siginUp_button.place(x=240, y=235)

    def siginUp_interface(self):
        """Show the sign-up placeholder dialog."""
        messagebox.showinfo(title='', message='进入注册界面')

    def backstage_interface(self):
        """Check the credentials and, on success, run the whole workflow.

        On success: close the login window, crawl, store the records in MySQL,
        log the total time, open the crawl window, then show the charts.
        """
        account = self.input_account.get().strip()
        password = self.input_password.get().strip()
        # NOTE(review): credentials are hard-coded in the source.
        if account != 'admin' or password != '123456':
            print('登陆失败,用户名或密码不正确')
            return
        print("登陆成功")
        self.root.destroy()
        begin = time.time()
        TianQiSpider().run()
        datas = _load_records()
        _store_records(datas)
        elapsed = time.time() - begin
        with open('log.txt', 'a', encoding='utf8') as f:
            f.write("总共用时%d秒" % (elapsed))
        print("总共用时%d秒" % (elapsed))
        summary = _summarize(datas)
        _open_crawl_window()
        _show_charts(summary)


if __name__ == '__main__':
    # Build the login window, lay it out and enter the Tk event loop.
    L = Login()
    L.gui_arrang()
    tk.mainloop()