有时候表格比较复杂,包含 caption、header、body、footer 等多个部分,
HtmlUnit 同样提供了相应的 API 支持。示例页面的 HTML 如下:
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title>复杂表格</title>
</head>
<body>
    <!-- Sample table with a caption, a header, a footer and two body sections. -->
    <table id="table1">
        <caption>复杂表格</caption>
        <!-- Header section: column titles. -->
        <thead>
            <tr>
                <th>个数</th>
                <th>名称</th>
            </tr>
        </thead>
        <!-- Footer section: the second cell is intentionally empty. -->
        <tfoot>
            <tr>
                <td>7</td>
                <td></td>
            </tr>
        </tfoot>
        <!-- First body section. -->
        <tbody>
            <tr>
                <td>5</td>
                <td>猪</td>
            </tr>
        </tbody>
        <!-- Second body section. -->
        <tbody>
            <tr>
                <td>2</td>
                <td>牛</td>
            </tr>
        </tbody>
    </table>
</body>
</html>
这个表格比前面一个复杂一点。
测试地址:http://www.java1234.com/crawler/table02.html
我们给下测试代码:
package com.open1111;

import java.io.IOException;
import java.net.MalformedURLException;
import java.util.List;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlTable;
import com.gargoylesoftware.htmlunit.html.HtmlTableBody;
import com.gargoylesoftware.htmlunit.html.HtmlTableFooter;
import com.gargoylesoftware.htmlunit.html.HtmlTableHeader;
import com.gargoylesoftware.htmlunit.html.HtmlTableRow;

/**
 * Demo: reads a table that has a caption, a header, several bodies and a footer
 * with HtmlUnit and prints each part to standard output.
 *
 * <p>The output order is: caption, header rows, rows of every body section,
 * footer rows.
 */
public class HtmlUnitTest6 {

    /** Address of the sample page that contains the table with id {@code table1}. */
    private static final String TABLE_URL = "http://www.java1234.com/crawler/table02.html";

    /**
     * Fetches the sample page and prints the parts of its table.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // try-with-resources closes the client (and releases its memory) on every path.
        try (WebClient webClient = new WebClient(BrowserVersion.FIREFOX_52)) {
            webClient.getOptions().setCssEnabled(false);        // CSS is not needed for parsing
            webClient.getOptions().setJavaScriptEnabled(false); // neither is JavaScript

            HtmlPage page = webClient.getPage(TABLE_URL);
            HtmlTable table = page.getHtmlElementById("table1");

            // Table caption.
            System.out.println("表格标题:" + table.getCaptionText());

            // Header rows; getHeader() yields null when the table has no <thead>.
            System.out.println("头信息:");
            HtmlTableHeader header = table.getHeader();
            if (header != null) {
                printRows(header.getRows());
            }

            // Rows of every <tbody> section, in document order.
            for (HtmlTableBody body : table.getBodies()) {
                printRows(body.getRows());
            }

            // Footer rows; getFooter() yields null when the table has no <tfoot>.
            System.out.println("根信息:");
            HtmlTableFooter footer = table.getFooter();
            if (footer != null) {
                printRows(footer.getRows());
            }
        } catch (FailingHttpStatusCodeException | MalformedURLException e) {
            // The server answered with a failing status code, or the URL is malformed.
            e.printStackTrace();
        } catch (IOException e) {
            // Any other I/O problem while fetching the page.
            e.printStackTrace();
        }
    }

    /**
     * Prints the text of each row on its own line.
     *
     * @param rows the rows to print, in order
     */
    private static void printRows(List<HtmlTableRow> rows) {
        for (HtmlTableRow row : rows) {
            System.out.println(row.asText());
        }
    }
}
运行输出:
表格标题:复杂表格
头信息:
个数 名称
5 猪
2 牛
根信息:
7