maven 依赖
<dependencies>
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.29</version>
</dependency>
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit-core-js</artifactId>
<version>2.28</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.3</version>
</dependency>
</dependencies>
代码如下
// Address of the page to load; Init() passes it through UrlUtils.encodeAnchor before fetching.
private String url;
// CSS class name handed to getElementsByClass in Init() to select the elements of interest.
private String body;
// Credential pair set only by the four-argument constructor (null otherwise).
// NOTE(review): neither field is read anywhere in the visible code — confirm whether login support is still planned.
private String UserName;
private String passWord;
/**
 * Creates a crawler for a page together with a credential pair.
 *
 * @param url      address of the page to load
 * @param body     CSS class name used to select elements from the loaded page
 * @param userName user name stored on the instance
 * @param passWord password stored on the instance
 */
public Crawler(String url, String body, String userName, String passWord) {
    // Credentials first, then the page coordinates; the assignments are independent.
    this.UserName = userName;
    this.passWord = passWord;
    this.body = body;
    this.url = url;
}
/**
 * Creates a crawler without credentials; the user name and password stay {@code null}.
 *
 * @param url  address of the page to load
 * @param body CSS class name used to select elements from the loaded page
 */
public Crawler(String url, String body) {
    this(url, body, null, null);
}
public Elements Init() throws InterruptedException, IOException {
HtmlPage htmlPage=null;
WebClient client=new WebClient();
client.getOptions().setJavaEnabled(true);
client.getOptions().setCssEnabled(false);
client.getOptions().setThrowExceptionOnFailingStatusCode(false);
client.getOptions().setRedirectEnabled(true);
String url2= UrlUtils.encodeAnchor(url);
htmlPage = client.getPage(url2);
System.out.println(htmlPage.toString());
Thread.sleep(2000);
Document parse = Jsoup.parse(htmlPage.asXml());
Elements *** = parse.getElementsByClass(body);
return ***;
}
/**
 * Collects the text of each element, in iteration order.
 *
 * @param elements elements to read
 * @return a new list holding one {@code text()} value per element
 */
public List<String> getElements(Elements elements){
    List<String> texts = new ArrayList<>();
    elements.forEach(node -> texts.add(node.text()));
    return texts;
}
/**
 * Parses the text of each element as a decimal integer, in iteration order.
 *
 * @param elements elements whose text is expected to be an integer
 * @return a new list holding one parsed value per element
 * @throws NumberFormatException if any element's text is not a parsable integer
 */
public List<Integer> getInteger(Elements elements){
    List<Integer> numbers = new ArrayList<>();
    for (Element node : elements) {
        String text = node.text();
        numbers.add(Integer.valueOf(text));
    }
    return numbers;
}
/**
 * Parses the text of each element as a double, in iteration order.
 *
 * @param elements elements whose text is expected to be a floating-point number
 * @return a new list holding one parsed value per element
 * @throws NumberFormatException if any element's text is not a parsable double
 */
public List<Double> getDouble(Elements elements){
    List<Double> numbers = new ArrayList<>();
    for (Element node : elements) {
        String text = node.text();
        numbers.add(Double.valueOf(text));
    }
    return numbers;
}
/**
 * Parses the text of each element as a {@code yyyy-MM-dd} date, in iteration order.
 *
 * <p>Parsing is strict: an impossible calendar date such as {@code 2023-02-31} raises
 * {@link ParseException} instead of silently rolling over into the following month.
 *
 * @param elements elements whose text is expected to start with a {@code yyyy-MM-dd} date
 * @return a new list holding one parsed date per element
 * @throws ParseException if any element's text cannot be parsed as a valid date
 */
public List<Date> getDate(Elements elements) throws ParseException {
    // Created per call rather than shared: SimpleDateFormat is not thread-safe.
    SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd");
    // The default lenient mode accepts out-of-range fields and normalises them,
    // which would turn malformed scraped data into plausible-looking wrong dates.
    format.setLenient(false);

    List<Date> dates = new ArrayList<>();
    for (Element element : elements) {
        dates.add(format.parse(element.text()));
    }
    return dates;
}
/**
 * Collects the value of one attribute from each element, in iteration order.
 *
 * @param elements elements to read
 * @param attr     name of the attribute to look up on every element
 * @return a new list holding one {@code attr(attr)} value per element
 */
public List<String> getAttr(Elements elements,String attr){
    List<String> values = new ArrayList<>();
    elements.forEach(node -> values.add(node.attr(attr)));
    return values;
}