<<  < 2021 - >  >>
1 2 3 4
5 6 7 8 9 10 11
12 13 14 15 16 17 18
19 20 21 22 23 24 25
26 27 28 29 30




maven 依赖

<dependencies>
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.29</version>
</dependency>


<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit-core-js</artifactId>
<version>2.28</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.3</version>
</dependency>
</dependencies>

代码如下

// Address of the page to load; Init() runs it through UrlUtils.encodeAnchor before fetching.
private String url;
// Class name that Init() passes to Document.getElementsByClass to select the result elements.
private String body;
// Set only by the four-argument constructor; not read by any code visible in this listing.
private String UserName;
// Set only by the four-argument constructor; not read by any code visible in this listing.
private String passWord;
/**
 * Creates a crawler for the given page and target class name, and also stores
 * a user name and password.
 *
 * @param url      address of the page to load
 * @param body     class name used to select elements from the loaded page
 * @param userName value stored in the {@code UserName} field
 * @param passWord value stored in the {@code passWord} field
 */
public Crawler(String url, String body, String userName, String passWord) {
    // The two-argument constructor performs the url/body assignments.
    this(url, body);
    this.passWord = passWord;
    this.UserName = userName;
}

/**
 * Creates a crawler for the given page and target class name. The
 * {@code UserName} and {@code passWord} fields are left unassigned.
 *
 * @param url  address of the page to load
 * @param body class name used to select elements from the loaded page
 */
public Crawler(String url, String body) {
    this.body = body;
    this.url = url;
}
public Elements Init() throws InterruptedException, IOException {

HtmlPage htmlPage=null;
WebClient client=new WebClient();
client.getOptions().setJavaEnabled(true);
client.getOptions().setCssEnabled(false);
client.getOptions().setThrowExceptionOnFailingStatusCode(false);
client.getOptions().setRedirectEnabled(true);

String url2= UrlUtils.encodeAnchor(url);

htmlPage = client.getPage(url2);
System.out.println(htmlPage.toString());
Thread.sleep(2000);
Document parse = Jsoup.parse(htmlPage.asXml());
Elements *** = parse.getElementsByClass(body);
return ***;
}
/**
 * Collects the text of every element, in iteration order.
 *
 * @param elements elements to read
 * @return a new list holding {@code Element.text()} for each element
 */
public List<String> getElements(Elements elements) {
    final List<String> texts = new ArrayList<>(elements.size());
    for (final Element item : elements) {
        texts.add(item.text());
    }
    return texts;
}
/**
 * Parses the text of every element as an {@code int}, in iteration order.
 *
 * @param elements elements whose text is parsed
 * @return a new list with one parsed value per element
 * @throws NumberFormatException if any element's text is not a parsable integer
 */
public List<Integer> getInteger(Elements elements) {
    final List<Integer> numbers = new ArrayList<>(elements.size());
    for (final Element item : elements) {
        final String text = item.text();
        numbers.add(Integer.valueOf(text));
    }
    return numbers;
}
/**
 * Parses the text of every element as a {@code double}, in iteration order.
 *
 * @param elements elements whose text is parsed
 * @return a new list with one parsed value per element
 * @throws NumberFormatException if any element's text is not a parsable double
 */
public List<Double> getDouble(Elements elements) {
    final List<Double> numbers = new ArrayList<>(elements.size());
    for (final Element item : elements) {
        final String text = item.text();
        numbers.add(Double.valueOf(text));
    }
    return numbers;
}
/**
 * Parses the text of every element as a date in {@code yyyy-MM-dd} form, in
 * iteration order.
 *
 * @param elements elements whose text is parsed
 * @return a new list with one parsed {@link Date} per element
 * @throws ParseException if any element's text cannot be parsed with that pattern
 */
public List<Date> getDate(Elements elements) throws ParseException {
    // A fresh formatter per call, so no instance is shared between callers.
    final SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd");
    final List<Date> dates = new ArrayList<>(elements.size());
    for (final Element item : elements) {
        dates.add(formatter.parse(item.text()));
    }
    return dates;
}
/**
 * Collects the value of one attribute from every element, in iteration order.
 *
 * @param elements elements to read
 * @param attr     name of the attribute to look up on each element
 * @return a new list holding {@code Element.attr(attr)} for each element
 */
public List<String> getAttr(Elements elements, String attr) {
    final List<String> values = new ArrayList<>(elements.size());
    for (final Element item : elements) {
        values.add(item.attr(attr));
    }
    return values;
}

  • 标签:java 
  • 发表评论:
    天涯博客欢迎您!