htmlparser解析网页内容代码
生活随笔
收集整理的這篇文章主要介紹了
htmlparser解析网页内容代码
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
/*抽取html網頁文本,循環到值節點并判斷是否有src鏈接*/public void htmlText(String url) throws Exception{try {URL u = new URL(url);if("https".equalsIgnoreCase(u.getProtocol())){SslUtils.ignoreSsl();}//生成一個解析器對象,用網頁的 url 作為參數Parser parser = new Parser(url);if(parser.getEncoding().equals("ISO-8859-1")) parser.setEncoding("UTF-8");//迭代所有節點, null 表示不使用 NodeFilterNodeList list = parser.parse(null);//循環值節點并輸出processNodeList(list); //輸出大節點/*SimpleNodeIterator iterator = list.elements();while (iterator.hasMoreNodes()) {Node node = iterator.nextNode();String result = node.toHtml();System.out.println(result); }*/}catch (ParserException e) {e.printStackTrace();}}private void processNodeList(NodeList list) {//迭代開始SimpleNodeIterator iterator = list.elements();while (iterator.hasMoreNodes()) {Node node = iterator.nextNode(); //得到該節點的子節點列表NodeList childList = node.getChildren();//孩子節點為空,說明是值節點if (null == childList){//得到值節點的值String txt = node.toHtml();System.out.println(txt);if(txt.contains("src=")){int start = txt.indexOf("src=");txt=txt.substring(start);int end = txt.indexOf(" ");if (end == -1) end = txt.indexOf(">");String src = txt.substring(5, end - 1);System.out.println(src);} }else processNodeList(childList);//孩子節點不為空,繼續迭代該孩子節點}}
/** 在htmlparser中,Node分成三類,都繼承AbstractNode:* RemarkNode:代表Html中的注釋* TagNode:標簽節點。* TextNode:文本節點*/public void tagGet(String url){try {Parser parser = new Parser(url);if(parser.getEncoding().equals("ISO-8859-1")) parser.setEncoding("UTF-8");NodeVisitor visitor = new NodeVisitor( true, true ) {public void visitTag(Tag tag) {System.out.println("This is Tag:"+tag.getText());}public void visitStringNode (Text string) {System.out.println("This is Text:"+string);}public void visitRemarkNode (Remark remark) {System.out.println("This is Remark:"+remark.getText());}public void beginParsing () {System.out.println("beginParsing");}public void visitEndTag (Tag tag){System.out.println("visitEndTag:"+tag.getText());}public void finishedParsing () {System.out.println("finishedParsing");}};parser.visitAllNodesWith(visitor);/*NodeVisitor visitor = new NodeVisitor() { public void visitTag(Tag tag) { System.out.println("正在訪問的tag:" + tag.getTagName() + " || Class is :"+ tag.getClass()); } }; parser.visitAllNodesWith(visitor);*///NodeList parse = parser.parse(null);//System.out.println(parse.toHtml());} catch (ParserException e) {e.printStackTrace();}}
/**
 * Fetches only the input, form and frame tags of the page at {@code url} and prints a
 * line for each of them; for a form, its own inputs are listed as well.
 *
 * @param url address of the page to parse
 * @throws ParserException if the page cannot be opened or parsed
 */
public void formGet(String url) throws ParserException {
    Parser parser = new Parser(url);
    if (parser.getEncoding().equals("ISO-8859-1")) {
        parser.setEncoding("UTF-8");
    }

    NodeFilter inputFilter = new NodeClassFilter(InputTag.class);
    NodeFilter formFilter = new NodeClassFilter(FormTag.class);
    NodeFilter frameFilter = new NodeClassFilter(FrameTag.class);
    OrFilter lastFilter = new OrFilter();
    lastFilter.setPredicates(new NodeFilter[] { formFilter, inputFilter, frameFilter });

    NodeList nodeList = parser.parse(lastFilter);
    // Strictly less than size(): index size() is one past the last element.
    for (int i = 0; i < nodeList.size(); i++) {
        Node node = nodeList.elementAt(i);
        if (node instanceof InputTag) {
            InputTag tag = (InputTag) node;
            System.out.println("Input Info:" + tag.getTagName() + ":" + tag.getText());
        }
        if (node instanceof FormTag) {
            FormTag tag = (FormTag) node;
            System.out.println("Form Info: " + tag.getFormName() + ":" + tag.getText());
            NodeList inputnodes = tag.getFormInputs();
            // Same bound as the outer loop; reading index size() used to fail here.
            for (int j = 0; j < inputnodes.size(); j++) {
                InputTag tag1 = (InputTag) inputnodes.elementAt(j);
                System.out.println("Input Info:" + tag1.getTagName() + ":" + tag1.getText());
            }
        }
        if (node instanceof FrameTag) {
            FrameTag tag = (FrameTag) node;
            System.out.println("Frame Info: " + tag.getFrameName() + ":" + tag.getText());
        }
    }
}

/**
 * Prints the name and text of every form tag of the page at {@code url}, using a parser
 * configured with a {@code PrototypicalNodeFactory}.
 *
 * <p>From the library documentation for org.htmlparser.PrototypicalNodeFactory: a node
 * factory based on the prototype pattern. This factory uses the prototype pattern to
 * generate new nodes. These are cloned as needed to form new Text, Remark and Tag nodes.
 *
 * @param url address of the page to parse
 * @throws ParserException if the page cannot be opened or parsed
 */
public void hyperGet(String url) throws ParserException {
    PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
    factory.registerTag(new FormTag());

    Parser parser = new Parser(url);
    parser.setNodeFactory(factory);

    NodeFilter filter = new NodeClassFilter(FormTag.class);
    NodeList nodeList = parser.parse(filter);
    for (Node node : nodeList.toNodeArray()) {
        if (node instanceof FormTag) {
            FormTag tag = (FormTag) node;
            System.out.println("Form Info: " + tag.getFormName() + ":" + tag.getText());
        }
    }
}
/** 在htmlparser中,Node分成三類,都繼承AbstractNode:* RemarkNode:代表Html中的注釋* TagNode:標簽節點。* TextNode:文本節點*/public void tagGet(String url){try {Parser parser = new Parser(url);if(parser.getEncoding().equals("ISO-8859-1")) parser.setEncoding("UTF-8");NodeVisitor visitor = new NodeVisitor( true, true ) {public void visitTag(Tag tag) {System.out.println("This is Tag:"+tag.getText());}public void visitStringNode (Text string) {System.out.println("This is Text:"+string);}public void visitRemarkNode (Remark remark) {System.out.println("This is Remark:"+remark.getText());}public void beginParsing () {System.out.println("beginParsing");}public void visitEndTag (Tag tag){System.out.println("visitEndTag:"+tag.getText());}public void finishedParsing () {System.out.println("finishedParsing");}};parser.visitAllNodesWith(visitor);/*NodeVisitor visitor = new NodeVisitor() { public void visitTag(Tag tag) { System.out.println("正在訪問的tag:" + tag.getTagName() + " || Class is :"+ tag.getClass()); } }; parser.visitAllNodesWith(visitor);*///NodeList parse = parser.parse(null);//System.out.println(parse.toHtml());} catch (ParserException e) {e.printStackTrace();}}
/**
 * Fetches only the input, form and frame tags of the page at {@code url} and prints a
 * line for each of them; for a form, its own inputs are listed as well.
 *
 * @param url address of the page to parse
 * @throws ParserException if the page cannot be opened or parsed
 */
public void formGet(String url) throws ParserException {
    Parser parser = new Parser(url);
    if (parser.getEncoding().equals("ISO-8859-1")) {
        parser.setEncoding("UTF-8");
    }

    NodeFilter inputFilter = new NodeClassFilter(InputTag.class);
    NodeFilter formFilter = new NodeClassFilter(FormTag.class);
    NodeFilter frameFilter = new NodeClassFilter(FrameTag.class);
    OrFilter lastFilter = new OrFilter();
    lastFilter.setPredicates(new NodeFilter[] { formFilter, inputFilter, frameFilter });

    NodeList nodeList = parser.parse(lastFilter);
    // Strictly less than size(): index size() is one past the last element.
    for (int i = 0; i < nodeList.size(); i++) {
        Node node = nodeList.elementAt(i);
        if (node instanceof InputTag) {
            InputTag tag = (InputTag) node;
            System.out.println("Input Info:" + tag.getTagName() + ":" + tag.getText());
        }
        if (node instanceof FormTag) {
            FormTag tag = (FormTag) node;
            System.out.println("Form Info: " + tag.getFormName() + ":" + tag.getText());
            NodeList inputnodes = tag.getFormInputs();
            // Same bound as the outer loop; reading index size() used to fail here.
            for (int j = 0; j < inputnodes.size(); j++) {
                InputTag tag1 = (InputTag) inputnodes.elementAt(j);
                System.out.println("Input Info:" + tag1.getTagName() + ":" + tag1.getText());
            }
        }
        if (node instanceof FrameTag) {
            FrameTag tag = (FrameTag) node;
            System.out.println("Frame Info: " + tag.getFrameName() + ":" + tag.getText());
        }
    }
}

/**
 * Prints the name and text of every form tag of the page at {@code url}, using a parser
 * configured with a {@code PrototypicalNodeFactory}.
 *
 * <p>From the library documentation for org.htmlparser.PrototypicalNodeFactory: a node
 * factory based on the prototype pattern. This factory uses the prototype pattern to
 * generate new nodes. These are cloned as needed to form new Text, Remark and Tag nodes.
 *
 * @param url address of the page to parse
 * @throws ParserException if the page cannot be opened or parsed
 */
public void hyperGet(String url) throws ParserException {
    PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
    factory.registerTag(new FormTag());

    Parser parser = new Parser(url);
    parser.setNodeFactory(factory);

    NodeFilter filter = new NodeClassFilter(FormTag.class);
    NodeList nodeList = parser.parse(filter);
    for (Node node : nodeList.toNodeArray()) {
        if (node instanceof FormTag) {
            FormTag tag = (FormTag) node;
            System.out.println("Form Info: " + tag.getFormName() + ":" + tag.getText());
        }
    }
}
總結
以上是生活随笔為你收集整理的htmlparser解析网页内容代码的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: htmlparser操作bean类提取h
- 下一篇: htmlparser新建tag类(以if