基本信息
源码名称:Java+Jsoup爬虫小红书微博B站
源码大小:5.48KB
文件格式:.zip
开发语言:Java
更新时间:2021-04-12
友情提示:(无需注册或充值,赞助后即可获取资源下载链接)
嘿,亲!知识可是无价之宝呢,但咱这精心整理的资料也耗费了不少心血呀。小小地破费一下,绝对物超所值哦!如有下载和支付问题,请联系我们QQ(微信同号):813200300
本次赞助数额为: 2 元
微信扫码支付:2 元
×
请留下您的邮箱,我们将在2小时内将文件发到您的邮箱
源码介绍
package com.example.demo; import com.alibaba.fastjson.JSONObject; import org.apache.http.HttpEntity; import org.apache.http.HttpStatus; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.junit.jupiter.api.Test; import org.springframework.boot.test.context.SpringBootTest; import java.net.URLEncoder; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; @SpringBootTest class DemoApplicationTests { /** * Sam * 643003372@qq.com * 2020-06-05 11:42 * 详细说明:https://baidu.com */ @Test void main() throws Exception { System.out.println("***********小红书***********"); this.xiaohongshu(); System.out.println("***********微博***********"); this.weibo(); System.out.println("***********B站***********"); this.bilibili(); } @Test void xiaohongshu() throws Exception { Document doc = Jsoup.connect("https://www.xiaohongshu.com/discovery/item/5e92cdf70000000001009b42").get(); Elements like = doc.select(".operation-block .like span"); Elements comment = doc.select(".operation-block .comment span"); Elements star = doc.select(".operation-block .star span"); System.out.println("小红书点赞量:" like.get(0).html().toString()); System.out.println("小红书评论量:" comment.get(0).html().toString()); System.out.println("小红书收藏量:" star.get(0).html().toString()); } @Test void weibo() throws Exception { String tidResponse = this.get("https://passport.weibo.com/visitor/genvisitor?cb=gen_callback", null); // System.out.println("获取tid数据为:" tidResponse); tidResponse = tidResponse.replaceAll("window.gen_callback && gen_callback\\(", ""); tidResponse = 
tidResponse.replaceAll("\\);", ""); JSONObject tidDataJson = JSONObject.parseObject(tidResponse); JSONObject tidData = (JSONObject) tidDataJson.get("data"); String subAndSubsResponse = this.get("https://passport.weibo.com/visitor/visitor?a=incarnate&t=" URLEncoder.encode(tidData.get("tid").toString(), "UTF-8") "&w=" (Boolean.valueOf(tidData.get("new_tid").toString())?3:2) "&c=100&cb=cross_domain&from=weibo", null); // System.out.println("获取sub和subp数据为:" subAndSubsResponse); subAndSubsResponse = subAndSubsResponse.replaceAll("window.cross_domain && cross_domain\\(", ""); subAndSubsResponse = subAndSubsResponse.replaceAll("\\);", ""); JSONObject subAndSubsDataJson = JSONObject.parseObject(subAndSubsResponse); JSONObject subAndSubsData = (JSONObject) subAndSubsDataJson.get("data"); //获取微博HTML页面 String Cookie = "SUB=" URLEncoder.encode(subAndSubsData.get("sub").toString(), "UTF-8") "; SUBP=" URLEncoder.encode(subAndSubsData.get("subp").toString(), "UTF-8") ";"; Connection connect = Jsoup.connect("https://weibo.com/1234692083/Ixnp6nuPk"); connect.header("Cookie", Cookie); String html = ""; Matcher m = Pattern.compile("FM.view\\(.*\\)").matcher(connect.get().html()); while (m.find()){ String val = m.group(0); val = val.replaceAll("FM.view\\(", ""); val = val.replaceAll("\\)", ""); JSONObject json = JSONObject.parseObject(val); if("pl.content.weiboDetail.index".equals(json.get("ns"))){ html = json.get("html").toString(); } } // System.out.println(html); Document doc = Jsoup.parseBodyFragment(html); List<String> list = new ArrayList<>(); for (Element e : doc.select(".WB_row_line span.S_line1 em:nth-child(2)")) { list.add(e.html()); } System.out.println("微博转发量:" list.get(1)); System.out.println("微博评论量:" list.get(2)); System.out.println("微博点赞量:" list.get(3)); } @Test void bilibili() throws Exception { Document doc = Jsoup.connect("https://www.bilibili.com/video/BV1HE411b7nj").get(); Matcher m = Pattern.compile("window.__INITIAL_STATE__=\\{.*\\}};").matcher(doc.html()); 
String aid = ""; while (m.find()){ String val = m.group(0); val = val.replaceAll("window.__INITIAL_STATE__=\\{", "{"); val = val.replaceAll("\\}};", "}}"); JSONObject json = JSONObject.parseObject(val); aid = json.get("aid").toString(); } String response = this.get("https://api.bilibili.com/x/web-interface/archive/stat?aid=" aid, null); // System.out.println("根据aid 获取数据为:" response); JSONObject json = JSONObject.parseObject(response); JSONObject data = (JSONObject) json.get("data"); System.out.println("B站点赞量:" data.get("like")); System.out.println("B站投币量:" data.get("coin")); System.out.println("B站收藏量:" data.get("favorite")); System.out.println("B站转发量:" data.get("share")); } public String get(String url, String cookie) throws Exception { CloseableHttpClient client = HttpClients.createDefault(); HttpGet httpGet = new HttpGet(url); RequestConfig requestConfig = RequestConfig.custom().setConnectTimeout(5000).setConnectionRequestTimeout(5000).setSocketTimeout(5000).build(); httpGet.setConfig(requestConfig); httpGet.setHeader("Cookie", cookie); CloseableHttpResponse response = client.execute(httpGet); if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { HttpEntity resEntity = response.getEntity(); return EntityUtils.toString(resEntity); } return ""; } }