日韩性视频-久久久蜜桃-www中文字幕-在线中文字幕av-亚洲欧美一区二区三区四区-撸久久-香蕉视频一区-久久无码精品丰满人妻-国产高潮av-激情福利社-日韩av网址大全-国产精品久久999-日本五十路在线-性欧美在线-久久99精品波多结衣一区-男女午夜免费视频-黑人极品ⅴideos精品欧美棵-人人妻人人澡人人爽精品欧美一区-日韩一区在线看-欧美a级在线免费观看

欢迎访问 生活随笔!

生活随笔

当前位置: 首页 > 编程资源 > 编程问答 > 内容正文

编程问答

java电子报刊网站_采集电子报纸 - 杨尚川的个人页面 - OSCHINA - 中文开源技术交流社区...

发布时间:2023/12/14 编程问答 32 豆豆
生活随笔 收集整理的这篇文章主要介绍了 java电子报刊网站_采集电子报纸 - 杨尚川的个人页面 - OSCHINA - 中文开源技术交流社区... 小编觉得挺不错的,现在分享给大家,帮大家做个参考。

1、接口

/**
 * Newspaper collector.
 *
 * <p>Each returned {@link File} corresponds to one page of the newspaper.
 *
 * @author 杨尚川
 */
public interface PaperCollector {

    /**
     * Downloads today's newspaper; one file corresponds to one page.
     *
     * @return the downloaded page files
     */
    List<File> collect();

    /**
     * Downloads the newspaper of the given date; one file corresponds to one page.
     *
     * @param date the issue date to download
     * @return the downloaded page files
     */
    List<File> collect(Date date);
}

2、抽象類

/**

*報(bào)紙采集器抽象類,通用采集功能實(shí)現(xiàn)

*?@author?楊尚川

*/

public?abstract?class?AbstractPaperCollector?implements?PaperCollector{

protected?final?Logger?LOG?=?LoggerFactory.getLogger(getClass());

@Override

public?List?collect()?{

return?collect(new?Date());

}

/**

*?根據(jù)下載鏈接提取文件夾名稱

*?@param?href?下載鏈接

*?@return?文件夾名稱

*/

protected?abstract?String?getPath(String?href);

/**

*?根據(jù)下載鏈接提取文件名稱

*?@param?href?下載鏈接

*?@return?文件名稱

*/

protected?abstract?String?getFile(String?href);

protected?List?downloadPaper(List?hrefs){

final?List?files?=?new?ArrayList<>();

List?ts?=?new?ArrayList<>();

LOG.info("報(bào)紙有"+hrefs.size()+"個(gè)版面需要下載:");

for(final?String?href?:?hrefs){

Thread?t?=?new?Thread(new?Runnable(){

@Override

public?void?run()?{

File?file?=?downloadPaper(href);

if(file?!=?null){

files.add(file);

}

}

});

t.start();

ts.add(t);

}

for(Thread?t?:?ts){

try?{

t.join();

}?catch?(InterruptedException?ex)?{

LOG.error("下載報(bào)紙出錯(cuò):",ex);

}

}

return?files;

}

protected?File?downloadPaper(String?href){

try{

LOG.info("下載報(bào)紙:"+href);

String?path?=?getPath(href);

LOG.debug("報(bào)紙保存目錄:"+path);

String?file?=?getFile(href);

LOG.debug("報(bào)紙保存文件:"+file);

File?dir?=?new?File(path);

if(!dir.exists()){

LOG.debug("創(chuàng)建目錄:"+dir.getAbsolutePath());

dir.mkdirs();

}

File?absoluteFile?=?new?File(path,?file);

LOG.debug("報(bào)紙保存絕對(duì)路徑:"+absoluteFile.getAbsolutePath());

Tools.copyFile(new?URL(href).openStream(),?absoluteFile);

LOG.info("報(bào)紙下載成功:"+href);

LOG.info("報(bào)紙成功保存到:"+absoluteFile.getAbsolutePath());

return?absoluteFile;

}catch(IOException?e){

LOG.error("報(bào)紙下載失敗:"+e);

}

return?null;

}

protected?void?run()?{

//今天

List?files?=?collect();

int?i?=?1;

for(File?file?:?files){

LOG.info((i++)+"?:?"?+?file.getAbsolutePath());

}

//昨天

Date?date?=?new?Date();

date.setTime(System.currentTimeMillis()-24*3600*1000);

files?=?collect(date);

i?=?1;

for(File?file?:?files){

LOG.info((i++)+"?:?"?+?file.getAbsolutePath());

}

//前天

date?=?new?Date();

date.setTime(System.currentTimeMillis()-2*24*3600*1000);

files?=?collect(date);

i?=?1;

for(File?file?:?files){

LOG.info((i++)+"?:?"?+?file.getAbsolutePath());

}

}

}

3、采集新华日报

/**

*?新華日?qǐng)?bào)

*?@author?楊尚川

*/

public?class?XHRBPaperCollector?extends?AbstractPaperCollector{

private?static?final?String?paperName?=?"新華日?qǐng)?bào)";

private?static?final?String?paperPath?=?"http://xh.xhby.net/newxh/";

private?static?final?String?url?=?paperPath+"html/";

private?static?final?String?hrefPrefix?=?paperPath+"page/1/";

private?static?final?String?start?=?"node_2.htm";

private?static?final?String?pdfCssQuery?=?"html?body?table?tbody?tr?td?table?tbody?tr?td?table?tbody?tr?td?table?tbody?tr?td?div?table?tbody?tr?td?a";

private?static?final?SimpleDateFormat?sf?=?new?SimpleDateFormat("yyyy-MM/dd/");

@Override

public?List?collect(Date?date)?{

List?hrefs?=?new?ArrayList<>();

try?{

LOG.debug("url:?"+url);

String?paper?=?url?+?sf.format(date)?+?start;

LOG.debug("paper:?"+paper);

Document?document?=?Jsoup.connect(paper).get();

LOG.debug("pdfCssQuery:?"?+?pdfCssQuery);

Elements?elements?=?document.select(pdfCssQuery);

for(Element?element?:?elements){

String?href?=?element.attr("href");

if(href?!=?null?&&?href.endsWith(".pdf")){

LOG.debug("報(bào)紙鏈接:"+href);

href?=?href.replace("../../../",?"");

LOG.debug("報(bào)紙鏈接:"+href);

hrefs.add(paperPath+href);

}else{

LOG.debug("不是報(bào)紙鏈接:"+href);

}

}

}?catch?(IOException?ex)?{

LOG.error("采集出錯(cuò)",ex);

}

return?downloadPaper(hrefs);

}

@Override

protected?String?getPath(String?href)?{

String?path?=?href.replace(hrefPrefix,?"");

String[]?attrs?=?path.split("/");

attrs?=?attrs[0].split("-");

StringBuilder?str?=?new?StringBuilder();

str.append(paperName)

.append(File.separator)

.append(attrs[0])

.append("-")

.append(attrs[1])

.append(File.separator)

.append(attrs[2]);

return?str.toString();

}

@Override

protected?String?getFile(String?href)?{

String?path?=?href.replace(hrefPrefix,?"");

String[]?attrs?=?path.split("/");

String?file?=?attrs[1]+".pdf";

return?file;

}

public?static?void?main(String[]?args)?{

new?XHRBPaperCollector().run();

}

}

4、采集楚天都市报

/**

*?楚天都市報(bào)

*?@author?楊尚川

*/

public?class?CTDSBPaperCollector?extends?AbstractPaperCollector{

private?static?final?String?paperName?=?"楚天都市報(bào)";

private?static?final?String?host?=?"http://ctdsb.cnhubei.com/";

private?static?final?String?paperPath?=?host+"ctdsb/";

private?static?final?String?url?=?host+"html/ctdsb/";

private?static?final?String?hrefPrefix?=?paperPath;

private?static?final?String?start?=?"index.html";

private?static?final?String?pdfCssQuery?=?"html?body?center?table?tbody?tr?td?table?tbody?tr?td?table?tbody?tr?td?table?tbody?tr?td?div?table?tbody?tr?td.info3?a";

private?static?final?SimpleDateFormat?sf?=?new?SimpleDateFormat("yyyyMMdd/");

@Override

public?List?collect(Date?date)?{

List?hrefs?=?new?ArrayList<>();

try?{

LOG.debug("url:?"+url);

String?paper?=?url?+?sf.format(date)?+?start;

LOG.debug("paper:?"+paper);

Document?document?=?Jsoup.connect(paper).get();

LOG.debug("pdfCssQuery:?"?+?pdfCssQuery);

Elements?elements?=?document.select(pdfCssQuery);

int?count=0;

for(Element?element?:?elements){

String?text?=?element.text();

if(text?!=?null?&&?text.startsWith("第")){

LOG.debug("報(bào)紙文本:"+text);

count++;

}else{

LOG.debug("不是報(bào)紙文本:"+text);

}

}

//有的版面缺失,而文件名是順序遞增的

for(int?i=1;?i<=count;?i++){

String?seq?=?Integer.toString(i);

if(i<10){

seq="0"+seq;

}

hrefs.add(paperPath?+?sf.format(date)?+?"page_"+seq+".jpg");

}

}?catch?(IOException?ex)?{

LOG.error("采集出錯(cuò)",ex);

}

return?downloadPaper(hrefs);

}

@Override

protected?String?getPath(String?href)?{

String?path?=?href.replace(hrefPrefix,?"");

String[]?attrs?=?path.split("/");

StringBuilder?str?=?new?StringBuilder();

str.append(paperName)

.append(File.separator)

.append(attrs[0].substring(0,?4))

.append("-")

.append(attrs[0].substring(4,?6))

.append(File.separator)

.append(attrs[0].substring(6,?8));

return?str.toString();

}

@Override

protected?String?getFile(String?href)?{

String?path?=?href.replace(hrefPrefix,?"");

String[]?attrs?=?path.split("/");

String?file?=?attrs[1].split("_")[1];

return?file;

}

public?static?void?main(String[]?args)?{

new?CTDSBPaperCollector().run();

}

}

5、采集京九晚报

/**

*?京九晚報(bào)

*?@author?楊尚川

*/

public?class?JJWBPaperCollector?extends?AbstractPaperCollector{

private?static?final?String?paperName?=?"京九晚報(bào)";

private?static?final?String?paperPath?=?"http://epaper.cnsq.com.cn/jjwb/";

private?static?final?String?url?=?paperPath+"html/";

private?static?final?String?hrefPrefix?=?paperPath+"page/10/";

private?static?final?String?start?=?"node_11.htm";

private?static?final?String?pdfCssQuery?=?"html?body?table?tbody?tr?td?table?tbody?tr?td?table?tbody?tr?td?table?tbody?tr?td?div?table?tbody?tr?td?a";

private?static?final?SimpleDateFormat?sf?=?new?SimpleDateFormat("yyyy-MM/dd/");

@Override

public?List?collect(Date?date)?{

List?hrefs?=?new?ArrayList<>();

try?{

LOG.debug("url:?"+url);

String?paper?=?url?+?sf.format(date)?+?start;

LOG.debug("paper:?"+paper);

Document?document?=?Jsoup.connect(paper).get();

LOG.debug("pdfCssQuery:?"?+?pdfCssQuery);

Elements?elements?=?document.select(pdfCssQuery);

for(Element?element?:?elements){

String?href?=?element.attr("href");

if(href?!=?null?&&?href.endsWith(".pdf")){

LOG.debug("報(bào)紙鏈接:"+href);

href?=?href.replace("../../../",?"");

LOG.debug("報(bào)紙鏈接:"+href);

hrefs.add(paperPath+href);

}else{

LOG.debug("不是報(bào)紙鏈接:"+href);

}

}

}?catch?(IOException?ex)?{

LOG.error("采集出錯(cuò)",ex);

}

return?downloadPaper(hrefs);

}

@Override

protected?String?getPath(String?href)?{

String?path?=?href.replace(hrefPrefix,?"");

String[]?attrs?=?path.split("/");

StringBuilder?str?=?new?StringBuilder();

str.append(paperName)

.append(File.separator)

.append(attrs[0])

.append(File.separator)

.append(attrs[1]);

return?str.toString();

}

@Override

protected?String?getFile(String?href)?{

String?path?=?href.replace(hrefPrefix,?"");

String[]?attrs?=?path.split("/");

String?file?=?attrs[2]+".pdf";

return?file;

}

public?static?void?main(String[]?args)?{

new?JJWBPaperCollector().run();

}

}

6、采集信息时报

/**

*?信息時(shí)報(bào)

*?@author?楊尚川

*/

public?class?XXSBPaperCollector?extends?AbstractPaperCollector{

private?static?final?String?paperName?=?"信息時(shí)報(bào)";

private?static?final?String?host?=?"http://informationtimes.dayoo.com/";

private?static?final?String?paperPath?=?host+"page/1019/";

private?static?final?String?url?=?host+"html/";

private?static?final?String?hrefPrefix?=?paperPath;

private?static?final?String?start?=?"node_1019.htm";

private?static?final?String?pdfCssQuery?=?"html?body#content?div.container?div.leftcolumn?div.leftcolumncontent?div.pagebuttontwo?div.con?p.right?span.dfive?a";

private?static?final?String?subCssQuery?=?"html?body#listcontent?div.container?div.rightcolumn?div.subcbga?div.listcontent?div#all_article_list.list?h4?span.left?a";

private?static?final?String?contentCssQuery?=?"html?body?div.container?div.leftcolumn?div.tbga?div.bbga?div.cbga?div.left?div.pagepicture?div?map?area";

private?static?final?SimpleDateFormat?sf?=?new?SimpleDateFormat("yyyy-MM/dd/");

@Override

public?List?collect(Date?date)?{

List?hrefs?=?new?ArrayList<>();

try?{

LOG.debug("url:?"+url);

String?paper?=?url?+?sf.format(date)?+?start;

LOG.debug("paper:?"+paper);

Document?document?=?Jsoup.connect(paper).get();

//1、找到子報(bào)紙

LOG.debug("subCssQuery:?"?+?subCssQuery);

Elements?elements?=?document.select(subCssQuery);

for(Element?element?:?elements){

String?text?=?element.text();

String?href?=?element.attr("href");

if(text?!=?null?&&?text.contains(":")?&&?href?!=?null?&&?href.endsWith(".htm")){

String?subPaperURL?=?url?+?sf.format(date)?+?href;

LOG.debug("子報(bào)紙文本:"+text+"?,?"+href);

LOG.debug("subPaperURL:"+subPaperURL);

//2、找到內(nèi)容頁(yè)面

LOG.debug("contentCssQuery:?"?+?contentCssQuery);

Elements?contentElements?=?Jsoup.connect(subPaperURL).get().select(contentCssQuery);

for(Element?contentElement?:?contentElements){

String?h?=?contentElement.attr("href");

if(h?!=?null?&&?h.startsWith("content_")?&&?h.endsWith(".htm")){

String?contentURL?=?url?+?sf.format(date)?+?h;

LOG.debug("contentURL:"+contentURL);

//3、找PDF

LOG.debug("pdfCssQuery:?"?+?pdfCssQuery);

Elements?pdfElements?=?Jsoup.connect(contentURL).get().select(pdfCssQuery);

for(Element?pdfElement?:?pdfElements){

String?pdf?=?pdfElement.attr("href");

if(pdf?!=?null?&&?pdf.endsWith(".pdf")){

LOG.debug("報(bào)紙鏈接:"+pdf);

pdf?=?pdf.replace("../../../",?"");

LOG.debug("報(bào)紙鏈接:"+pdf);

hrefs.add(host+pdf);

}else{

LOG.debug("不是報(bào)紙鏈接:"+pdf);

}

}

//有多個(gè)content,選擇一個(gè)即可

break;

}

}

}else{

LOG.debug("不是子報(bào)紙文本:"+text+"?,?"+href);

}

}

}?catch?(IOException?ex)?{

LOG.error("采集出錯(cuò)",ex);

}

return?downloadPaper(hrefs);

}

@Override

protected?String?getPath(String?href)?{

String?path?=?href.replace(hrefPrefix,?"");

String[]?attrs?=?path.split("/");

StringBuilder?str?=?new?StringBuilder();

str.append(paperName)

.append(File.separator)

.append(attrs[0])

.append(File.separator)

.append(attrs[1]);

return?str.toString();

}

@Override

protected?String?getFile(String?href)?{

String?path?=?href.replace(hrefPrefix,?"");

String[]?attrs?=?path.split("/");

String?file?=?attrs[2]+".pdf";

return?file;

}

public?static?void?main(String[]?args)?{

new?XXSBPaperCollector().run();

}

}

7、采集羊城晚报

/**

*?羊城晚報(bào)

*?@author?楊尚川

*/

public?class?YCWBPaperCollector?extends?AbstractPaperCollector{

private?static?final?String?paperName?=?"羊城晚報(bào)";

private?static?final?String?paperPath?=?"http://www.ycwb.com/ePaper/ycwb/";

private?static?final?String?url?=?paperPath+"html/";

private?static?final?String?hrefPrefix?=?paperPath+"images/";

private?static?final?String?start?=?"node_2081.htm";

private?static?final?String?pdfCssQuery?=?"html?body?div.cbody?div.areaL?div.box?div.conBox2?div?div.xx?h2?em?a.px12";

private?static?final?SimpleDateFormat?sf?=?new?SimpleDateFormat("yyyy-MM/dd/");

@Override

public?List?collect(Date?date)?{

List?hrefs?=?new?ArrayList<>();

try?{

LOG.debug("url:?"+url);

String?paper?=?url?+?sf.format(date)?+?start;

LOG.debug("paper:?"+paper);

Document?document?=?Jsoup.connect(paper).get();

LOG.debug("pdfCssQuery:?"?+?pdfCssQuery);

Elements?elements?=?document.select(pdfCssQuery);

for(Element?element?:?elements){

String?href?=?element.attr("href");

if(href?!=?null?&&?href.endsWith(".pdf")){

LOG.debug("報(bào)紙鏈接:"+href);

href?=?href.replace("../../../",?"");

LOG.debug("報(bào)紙鏈接:"+href);

hrefs.add(paperPath+href);

}else{

LOG.debug("不是報(bào)紙鏈接:"+href);

}

}

}?catch?(IOException?ex)?{

LOG.error("采集出錯(cuò)",ex);

}

return?downloadPaper(hrefs);

}

@Override

protected?String?getPath(String?href)?{

String?path?=?href.replace(hrefPrefix,?"");

String[]?attrs?=?path.split("/");

StringBuilder?str?=?new?StringBuilder();

str.append(paperName)

.append(File.separator)

.append(attrs[0])

.append(File.separator)

.append(attrs[1]);

return?str.toString();

}

@Override

protected?String?getFile(String?href)?{

String?path?=?href.replace(hrefPrefix,?"");

String[]?attrs?=?path.split("/");

String?file?=?attrs[2]+".pdf";

return?file;

}

public?static?void?main(String[]?args)?{

new?YCWBPaperCollector().run();

}

}

总结

以上是生活随笔为你收集整理的java电子报刊网站_采集电子报纸 - 杨尚川的个人页面 - OSCHINA - 中文开源技术交流社区...的全部内容,希望文章能够帮你解决所遇到的问题。

如果觉得生活随笔网站内容还不错,欢迎将生活随笔推荐给好友。