1. 在 Maven 的 pom.xml 文件中引入 Jsoup 依赖
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.14.3</version>
</dependency>
2. 直接写具体的爬虫方法
Controller层
/**
 * Plugin import endpoint: crawls the plugin documentation page at the
 * given URL (currently pages under https://api.oioweb.cn) and returns
 * the parsed details as a map.
 */
@PostMapping(value = "/importPlugins")
@ApiOperation(value = "插件导入", notes = "获取插件的详细请求信息")
public R<Map<String, Object>> importPlugins(@RequestBody CrawlerDto crawlerDto) {
    Map<String, Object> pluginInfo = wdPluginsService.importPlugins(crawlerDto.getUrl());
    return R.ok(pluginInfo);
}
Service层
/**
 * Crawls the plugin documentation page at the given URL and returns its
 * parsed details (title, description, endpoint metadata, parameter tables).
 *
 * @param url documentation page URL; currently only pages containing
 *            "api.oioweb.cn/doc" are handled
 * @return parsed plugin info; an empty map when the URL is not a supported site
 */
Map<String, Object> importPlugins(String url);
ServiceImpl实现类
@Override
public Map<String, Object> importPlugins(String url) {
    // Dispatch by documentation site; only the "teacher" API docs
    // (api.oioweb.cn) are supported at the moment.
    if (url.contains("api.oioweb.cn/doc")) {
        return getUrlInfoByTeachMan(url);
    }
    // Unsupported site: return an empty result rather than null.
    return new HashMap<>();
}
/**
 * Scrapes a plugin documentation page from api.oioweb.cn (the "teacher" API
 * site) and extracts the title, description, endpoint summary fields and the
 * request/response parameter tables.
 *
 * @param url the documentation page URL to fetch
 * @return map with keys "title", "describe", "faceUrl", "returnFomat",
 *         "method", "reExample", "requestList", "responseList"; keys are
 *         omitted when the corresponding elements are missing from the page.
 *         Returns a (possibly partial) map on I/O failure.
 */
private Map<String, Object> getUrlInfoByTeachMan(String url) {
    Map<String, Object> returnMap = new HashMap<>();
    try {
        Document document = Jsoup.connect(url).get();
        // Page header: <fieldset> holds the title, <blockquote> the description.
        // BUG FIX: guard get(0) — the old code threw IndexOutOfBoundsException
        // when either element was absent.
        Elements fieldset = document.select("fieldset");
        if (!fieldset.isEmpty()) {
            returnMap.put("title", fieldset.get(0).text());
        }
        Elements blockquote = document.select("blockquote");
        if (!blockquote.isEmpty()) {
            returnMap.put("describe", blockquote.get(0).text());
        }
        // The active tab pane contains the API details.
        Elements divElements = document.select("div.layui-tab-item.layui-show");
        // BUG FIX: Jsoup select() never returns null; the old `!= null` check
        // was always true. An empty Elements is the real "not found" case.
        if (!divElements.isEmpty()) {
            // Each summary <p> reads "label: value"; strip up to the first colon.
            Elements pElements = divElements.select("p.simpleTable");
            // BUG FIX: guard against pages with fewer than 4 summary rows.
            if (pElements.size() >= 4) {
                String faceUrl = pElements.get(0).text();
                String returnFomat = pElements.get(1).text();
                String method = pElements.get(2).text();
                String reExample = pElements.get(3).text();
                returnMap.put("faceUrl", faceUrl.substring(faceUrl.indexOf(":", 1) + 1));
                // NOTE: key "returnFomat" (sic) is kept — callers already
                // consume this misspelled key, renaming would break them.
                returnMap.put("returnFomat", returnFomat.substring(returnFomat.indexOf(": ") + 1));
                returnMap.put("method", method.substring(method.indexOf(": ") + 1));
                returnMap.put("reExample", reExample.substring(reExample.indexOf(":", 1) + 1));
            }
            for (Element div : divElements) {
                Elements tableElements = div.select("table");
                Elements thead = tableElements.select("thead");
                // BUG FIX: the old code called thead.get(0)/get(1) without
                // checking the page actually has both table heads.
                if (thead.size() < 2) {
                    continue;
                }
                // Column headers of the request-parameter table.
                List<String> requestHeaders = new ArrayList<>();
                for (Element th : thead.get(0).select("th")) {
                    requestHeaders.add(th.text());
                }
                // Column headers of the response-parameter table.
                List<String> responseHeaders = new ArrayList<>();
                for (Element th : thead.get(1).select("th")) {
                    responseHeaders.add(th.text());
                }
                Elements trs = tableElements.select("tbody").select("tr");
                List<Map<String, String>> reqList = new ArrayList<>();
                List<Map<String, String>> resList = new ArrayList<>();
                for (Element tr : trs) {
                    Elements tds = tr.select("td");
                    if (tds.size() == 4) {
                        // 4 cells → request-parameter row.
                        Map<String, String> reqData = new HashMap<>();
                        // BUG FIX: bound by BOTH header count and cell count so a
                        // ragged row cannot trigger IndexOutOfBoundsException.
                        int reqCols = Math.min(requestHeaders.size(), tds.size());
                        for (int i = 0; i < reqCols; i++) {
                            reqData.put(requestHeaders.get(i), tds.get(i).text());
                        }
                        reqList.add(reqData);
                    } else {
                        // Anything else (expected: 3 cells) → response-parameter row.
                        Map<String, String> resData = new HashMap<>();
                        int resCols = Math.min(responseHeaders.size(), tds.size());
                        for (int i = 0; i < resCols; i++) {
                            resData.put(responseHeaders.get(i), tds.get(i).text());
                        }
                        resList.add(resData);
                    }
                }
                returnMap.put("requestList", reqList);
                returnMap.put("responseList", resList);
                // Removed the System.out.println(returnMap) debug dump.
            }
        }
    } catch (IOException e) {
        // TODO(review): route through a proper logger (SLF4J); kept as
        // printStackTrace because no logger field is visible in this class.
        e.printStackTrace();
    }
    return returnMap;
}
3. 总结:Java 爬虫适用于结构固定的同一类页面——针对某个网站中版式一致的页面,批量抓取所需的相关信息
如有建议,欢迎指教!
本站资源均来自互联网,仅供研究学习,禁止违法使用和商用,产生法律纠纷本站概不负责!如果侵犯了您的权益请与我们联系!
转载请注明出处: 免费源码网-免费的源码资源网站 » Java爬虫之Jsoup爬Html网页链接
发表评论 取消回复