日韩性视频-久久久蜜桃-www中文字幕-在线中文字幕av-亚洲欧美一区二区三区四区-撸久久-香蕉视频一区-久久无码精品丰满人妻-国产高潮av-激情福利社-日韩av网址大全-国产精品久久999-日本五十路在线-性欧美在线-久久99精品波多结衣一区-男女午夜免费视频-黑人极品ⅴideos精品欧美棵-人人妻人人澡人人爽精品欧美一区-日韩一区在线看-欧美a级在线免费观看

歡迎訪問 生活随笔!

生活随笔

當前位置: 首頁 > 编程语言 > java > 内容正文

java

Java正则表达式获取网页所有网址和链接文字

發布時間:2023/12/4 java 32 豆豆
生活随笔收集整理的這篇文章主要介紹了《Java正则表达式获取网页所有网址和链接文字》,小編覺得挺不錯的,現在分享給大家,幫大家做個參考。

/*獲取網址首頁的所有網址和鏈接文字*/


import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import java.net.*;
import java.io.*;
import java.util.regex.*;

/**
 * Downloads a web page, cuts out the region between two marker strings and
 * lists every hyperlink (URL plus link text) found in that region using
 * regular expressions.
 *
 * <p>Typical call order, as used by {@link #main(String[])}:
 * {@link #getStartUrl(String)}, {@link #getUrlContent()},
 * {@link #getContentArea()}, optionally {@link #getStringInUrl(String)} /
 * {@link #getStringNotInUrl(String)}, then {@link #Urls()}.
 *
 * <p>Note: several methods are named {@code getXxx} although they store a
 * value or compute a field; the names are kept for compatibility with
 * existing callers.
 */
public class Urls
{
    /** Rule used to find anchors: group 1 = attribute text, group 2 = inner HTML. */
    private static final String ANCHOR_REGEX = "<a\\s([^>]*)>(.*?)</a\\s*>";

    /** Compiled once; {@code <a\s} avoids matching tags such as {@code <abbr>}. */
    private static final Pattern ANCHOR =
            Pattern.compile(ANCHOR_REGEX, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** href value, either double-quoted (group 1), single-quoted (group 2) or bare (group 3). */
    private static final Pattern HREF = Pattern.compile(
            "(?:^|\\s)href\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'>]+))",
            Pattern.CASE_INSENSITIVE);

    /** Any markup tag; used to strip nested tags out of the link text. */
    private static final Pattern TAG = Pattern.compile("<[^>]*>");

    private String startUrl;                      // address of the page to scrape
    String urlContent;                            // full page text, lines concatenated
    String ContentArea;                           // region of urlContent between the two markers

    private String strAreaBegin, strAreaEnd;      // markers delimiting the region to scan
    private String stringInUrl, stringNotInUrl;   // keyword a URL must / must not contain
    String strContent;                            // collected content (not used by this class)
    String[] allUrls;                             // URLs reported by the last call to Urls()
    private String regex = ANCHOR_REGEX;          // collection rule currently in use

    UrlAndTitle urlAndTitle = new UrlAndTitle();  // holder for one URL/title pair


    /** Demo: lists the links of a fixed start page, skipping URLs that contain "google". */
    public static void main(String[] args)
    {
        Urls myurl = new Urls("<body", "/body>");
        myurl.getStartUrl("http://www.zuzwn.com/");

        myurl.getUrlContent();
        myurl.getContentArea();

        myurl.getStringNotInUrl("google");
        myurl.Urls();
    }


    /**
     * Creates a scraper that scans the text between two markers.
     *
     * @param strAreaBegin marker after which the scanned region starts
     * @param strAreaEnd   marker before which the scanned region ends
     */
    public Urls(String strAreaBegin, String strAreaEnd)
    {
        this.strAreaBegin = strAreaBegin;
        this.strAreaEnd = strAreaEnd;
    }


    /**
     * Scans {@code ContentArea} for hyperlinks and prints, for each accepted
     * link, the raw anchor, its title (link text with nested tags removed),
     * its URL and a blank line, followed by a final count line. The accepted
     * URLs are also stored in {@code allUrls}.
     *
     * <p>Anchors without a non-empty {@code href} carry no URL and are skipped.
     * A link is also skipped when a required keyword (see
     * {@link #getStringInUrl(String)}) is missing from its URL or an excluded
     * keyword (see {@link #getStringNotInUrl(String)}) is present.
     *
     * <p>This is an ordinary method that happens to share the class name; it
     * is not a constructor.
     */
    public void Urls()
    {
        List<String> found = new ArrayList<String>();

        // ContentArea is null until getContentArea() has run; treat that as "no links".
        if (ContentArea != null)
        {
            Matcher anchor = ANCHOR.matcher(ContentArea);
            while (anchor.find())
            {
                String url = hrefOf(anchor.group(1));
                if (url == null || !accepts(url))
                {
                    continue;
                }
                String title = TAG.matcher(anchor.group(2)).replaceAll("").trim();

                System.out.println(anchor.group());
                System.out.println("標題:" + title);
                System.out.println("網址:" + url);
                System.out.println();

                found.add(url);
            }
        }

        allUrls = found.toArray(new String[0]);
        System.out.println("共有" + found.size() + "個符合結果");
    }


    /** Returns the non-empty href value inside an anchor's attribute text, or null if there is none. */
    private static String hrefOf(String attributes)
    {
        Matcher m = HREF.matcher(attributes);
        if (!m.find())
        {
            return null;
        }
        for (int g = 1; g <= 3; g++)
        {
            String value = m.group(g);
            if (value != null)
            {
                value = value.trim();
                return value.isEmpty() ? null : value;
            }
        }
        return null;
    }


    /** Applies the include/exclude keyword filters; an unset or empty keyword filters nothing. */
    private boolean accepts(String url)
    {
        if (stringInUrl != null && !url.contains(stringInUrl))
        {
            return false;
        }
        // An empty exclude keyword is ignored: contains("") is always true.
        if (stringNotInUrl != null && !stringNotInUrl.isEmpty() && url.contains(stringNotInUrl))
        {
            return false;
        }
        return true;
    }


    /**
     * Stores the address of the page to scrape (a setter despite its name).
     *
     * @param startUrl absolute URL of the page
     */
    public void getStartUrl(String startUrl)
    {
        this.startUrl = startUrl;
    }


    /**
     * Downloads the page at the start URL into {@code urlContent}. Lines are
     * concatenated without separators.
     *
     * <p>Best effort: on failure a message and the stack trace are printed and
     * {@code urlContent} is left unchanged.
     */
    public void getUrlContent()
    {
        StringBuilder page = new StringBuilder();

        // try-with-resources closes the reader (and the underlying stream) on every path.
        // NOTE(review): decoding uses the platform default charset, as before;
        // confirm the target site's encoding before hard-coding one.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new URL(startUrl).openStream())))
        {
            String line;
            while ((line = br.readLine()) != null)
            {
                page.append(line);
            }
            urlContent = page.toString();
        }
        catch (IOException | RuntimeException e)
        {
            // A bad URL or a network failure must not abort the caller.
            System.out.println("網址文件未能輸出");
            e.printStackTrace();
        }
    }


    /**
     * Sets {@code ContentArea} to the part of {@code urlContent} that lies
     * after the first begin marker and before the next end marker.
     *
     * <p>If no page has been loaded the area is empty. If the begin marker is
     * missing the area starts at the beginning of the page; if the end marker
     * is missing it runs to the end of the page.
     */
    public void getContentArea()
    {
        if (urlContent == null)
        {
            ContentArea = "";
            return;
        }

        int begin = 0;
        if (strAreaBegin != null)
        {
            int at = urlContent.indexOf(strAreaBegin);
            if (at >= 0)
            {
                begin = at + strAreaBegin.length();
            }
        }

        int end = urlContent.length();
        if (strAreaEnd != null)
        {
            int at = urlContent.indexOf(strAreaEnd, begin);
            if (at >= 0)
            {
                end = at;
            }
        }

        ContentArea = urlContent.substring(begin, end);
    }


    // The next two methods store the keyword a URL must contain and the keyword
    // it must not contain. Only a single keyword of each kind is supported.

    /**
     * Stores the keyword every reported URL must contain (a setter despite its name).
     *
     * @param stringInUrl required substring, or null for no restriction
     */
    public void getStringInUrl(String stringInUrl)
    {
        this.stringInUrl = stringInUrl;
    }


    /**
     * Stores the keyword no reported URL may contain (a setter despite its name).
     *
     * @param stringNotInUrl excluded substring, or null/empty for no restriction
     */
    public void getStringNotInUrl(String stringNotInUrl)
    {
        this.stringNotInUrl = stringNotInUrl;
    }


    /** Placeholder; currently does nothing. */
    public void getUrl()
    {
    }


    /**
     * @return the regular expression used to locate anchors
     */
    public String getRegex()
    {
        return regex;
    }


    /** Plain holder for one URL and its title. */
    class UrlAndTitle
    {
        String myURL;
        String title;
    }
}

轉載于:https://www.cnblogs.com/zuzwn/p/3614978.html

總結

以上是生活随笔為你收集整理的Java正则表达式获取网页所有网址和链接文字的全部內容,希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。