This script crawls post information from the "Python 人工智能技术交流" (Python AI discussion) board of the 黑马 (itheima) forum. For learning and reference purposes only.
Contents of the generated JSON file:
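The values below are illustrative placeholders rather than real scraped data; because xpath() returns a list of matches, every field in a record is stored as a list:

[
    {
        "文章标题": ["Example post title"],
        "文章链接": ["thread-123456-1-1.html"],
        "发帖人": ["example_user"],
        "时间": ["2023-1-1"]
    }
]

The full crawler source code: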
import requests
import json
from lxml import etree


def load_page(url):
    """Download the raw HTML of one forum page."""
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"}
    response = requests.get(url, headers=headers)
    return response.text


def parse_html(html):
    """Extract the title, link, author, and date of every thread on the page."""
    text = etree.HTML(html)
    items = []  # collect one dict per thread
    # In this Discuz-style forum, each thread row is its own <tbody> inside the thread-list table
    nodelist = text.xpath("//*[@id='threadlisttableid']/tbody")
    for node in nodelist:
        title = node.xpath("./tr/th/a[1]/text()")                 # post title
        title_url = node.xpath("./tr/th/a[1]/@href")              # post link
        author = node.xpath("./tr/th/div[2]/i[1]/a/span/text()")  # author
        date = node.xpath("./tr/th/div[2]/i[1]/span[1]/text()")   # post date
        # Bundle one record; the Chinese keys of the original output file are kept as-is
        item = {
            "文章标题": title,      # post title
            "文章链接": title_url,  # post link
            "发帖人": author,       # author
            "时间": date            # date
        }
        items.append(item)
    return items


def save_files(items):
    """Write all records to heimabbs.json as a single JSON array."""
    with open("heimabbs.json", "w", encoding="utf-8") as f:  # mode must be "w", not "w "
        f.write(json.dumps(items, ensure_ascii=False))


def page_ctl(start_page, end_page):
    """Crawl pages start_page..end_page (inclusive) and save the combined results."""
    data_all = []
    for page in range(start_page, end_page + 1):  # range() excludes the stop value, hence + 1
        url = f'http://bbs.itheima.com/forum-425-{page}.html'
        print("Loading page: " + str(page))
        html = load_page(url)
        data = parse_html(html)
        data_all = data_all + data  # accumulate records across pages
    save_files(data_all)


if __name__ == '__main__':
    start = input("请输入起始页:")  # "Enter the start page:"
    end = input("请输入结束页:")    # "Enter the end page:"
    page_ctl(int(start), int(end))
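A sample run might look like the following (the page numbers are arbitrary; this crawls pages 1 through 3 of board 425 and writes the combined records to heimabbs.json):

请输入起始页:1
请输入结束页:3
Loading page: 1
Loading page: 2
Loading page: 3

Note that since xpath() returns lists, each field of a saved record is a (possibly empty) list; a common refinement is to take the first element of each list before saving, but the code above keeps the original behavior.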