基本信息
源码名称:Java+Jsoup爬虫小红书微博B站
源码大小:5.48KB
文件格式:.zip
开发语言:Java
更新时间:2021-04-12
友情提示:(无需注册或充值,赞助后即可获取资源下载链接)
嘿,亲!知识可是无价之宝呢,但咱这精心整理的资料也耗费了不少心血呀。小小地破费一下,绝对物超所值哦!如有下载和支付问题,请联系我们QQ(微信同号):813200300
本次赞助数额为:2 元
微信扫码支付:2 元
请留下您的邮箱,我们将在2小时内将文件发到您的邮箱
源码介绍
package com.example.demo;
import com.alibaba.fastjson.JSONObject;
import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Test;
import org.springframework.boot.test.context.SpringBootTest;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@SpringBootTest
class DemoApplicationTests {
/**
* Sam
* 643003372@qq.com
* 2020-06-05 11:42
* 详细说明:https://baidu.com
*/
@Test
void main() throws Exception {
System.out.println("***********小红书***********");
this.xiaohongshu();
System.out.println("***********微博***********");
this.weibo();
System.out.println("***********B站***********");
this.bilibili();
}
@Test
void xiaohongshu() throws Exception {
Document doc = Jsoup.connect("https://www.xiaohongshu.com/discovery/item/5e92cdf70000000001009b42").get();
Elements like = doc.select(".operation-block .like span");
Elements comment = doc.select(".operation-block .comment span");
Elements star = doc.select(".operation-block .star span");
System.out.println("小红书点赞量:" like.get(0).html().toString());
System.out.println("小红书评论量:" comment.get(0).html().toString());
System.out.println("小红书收藏量:" star.get(0).html().toString());
}
@Test
void weibo() throws Exception {
String tidResponse = this.get("https://passport.weibo.com/visitor/genvisitor?cb=gen_callback", null);
// System.out.println("获取tid数据为:" tidResponse);
tidResponse = tidResponse.replaceAll("window.gen_callback && gen_callback\\(", "");
tidResponse = tidResponse.replaceAll("\\);", "");
JSONObject tidDataJson = JSONObject.parseObject(tidResponse);
JSONObject tidData = (JSONObject) tidDataJson.get("data");
String subAndSubsResponse = this.get("https://passport.weibo.com/visitor/visitor?a=incarnate&t=" URLEncoder.encode(tidData.get("tid").toString(), "UTF-8") "&w=" (Boolean.valueOf(tidData.get("new_tid").toString())?3:2) "&c=100&cb=cross_domain&from=weibo", null);
// System.out.println("获取sub和subp数据为:" subAndSubsResponse);
subAndSubsResponse = subAndSubsResponse.replaceAll("window.cross_domain && cross_domain\\(", "");
subAndSubsResponse = subAndSubsResponse.replaceAll("\\);", "");
JSONObject subAndSubsDataJson = JSONObject.parseObject(subAndSubsResponse);
JSONObject subAndSubsData = (JSONObject) subAndSubsDataJson.get("data");
//获取微博HTML页面
String Cookie = "SUB=" URLEncoder.encode(subAndSubsData.get("sub").toString(), "UTF-8") "; SUBP=" URLEncoder.encode(subAndSubsData.get("subp").toString(), "UTF-8") ";";
Connection connect = Jsoup.connect("https://weibo.com/1234692083/Ixnp6nuPk");
connect.header("Cookie", Cookie);
String html = "";
Matcher m = Pattern.compile("FM.view\\(.*\\)").matcher(connect.get().html());
while (m.find()){
String val = m.group(0);
val = val.replaceAll("FM.view\\(", "");
val = val.replaceAll("\\)", "");
JSONObject json = JSONObject.parseObject(val);
if("pl.content.weiboDetail.index".equals(json.get("ns"))){
html = json.get("html").toString();
}
}
// System.out.println(html);
Document doc = Jsoup.parseBodyFragment(html);
List<String> list = new ArrayList<>();
for (Element e : doc.select(".WB_row_line span.S_line1 em:nth-child(2)")) {
list.add(e.html());
}
System.out.println("微博转发量:" list.get(1));
System.out.println("微博评论量:" list.get(2));
System.out.println("微博点赞量:" list.get(3));
}
@Test
void bilibili() throws Exception {
Document doc = Jsoup.connect("https://www.bilibili.com/video/BV1HE411b7nj").get();
Matcher m = Pattern.compile("window.__INITIAL_STATE__=\\{.*\\}};").matcher(doc.html());
String aid = "";
while (m.find()){
String val = m.group(0);
val = val.replaceAll("window.__INITIAL_STATE__=\\{", "{");
val = val.replaceAll("\\}};", "}}");
JSONObject json = JSONObject.parseObject(val);
aid = json.get("aid").toString();
}
String response = this.get("https://api.bilibili.com/x/web-interface/archive/stat?aid=" aid, null);
// System.out.println("根据aid 获取数据为:" response);
JSONObject json = JSONObject.parseObject(response);
JSONObject data = (JSONObject) json.get("data");
System.out.println("B站点赞量:" data.get("like"));
System.out.println("B站投币量:" data.get("coin"));
System.out.println("B站收藏量:" data.get("favorite"));
System.out.println("B站转发量:" data.get("share"));
}
public String get(String url, String cookie) throws Exception {
CloseableHttpClient client = HttpClients.createDefault();
HttpGet httpGet = new HttpGet(url);
RequestConfig requestConfig = RequestConfig.custom().setConnectTimeout(5000).setConnectionRequestTimeout(5000).setSocketTimeout(5000).build();
httpGet.setConfig(requestConfig);
httpGet.setHeader("Cookie", cookie);
CloseableHttpResponse response = client.execute(httpGet);
if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
HttpEntity resEntity = response.getEntity();
return EntityUtils.toString(resEntity);
}
return "";
}
}