當前位置：首頁 > 人文社科 > 生活经验 >内容正文

生活经验

php作为文本进行处理,PHP处理文本和爬虫技巧

發布時間：2023/11/27 生活经验 31 豆豆

生活随笔收集整理的這篇文章主要介紹了 php作为文本进行处理,PHP处理文本和爬虫技巧小編覺得挺不錯的,現在分享給大家,幫大家做個參考.

用 PHP 處理文本時要注意：explode() 拆分字符串後，若把返回值賦給一個變量，該變量就是一個數組，拆分出的各段依次存入其中；若賦給 list(變量1, 變量2, ……)，則拆分出的各段會分別存到變量1、變量2、……。

結合 array_filter() 過濾時，如果不傳回調函數而過濾不乾淨，可以再增加一個過濾回調函數，排除 "\r\n" 以及原本默認就會過濾的 FALSE（不寫回調函數時，默認只過濾等價於 FALSE 的元素）。

調試可用file_put_contents("/home/search/result", var_export(['time'=> date('Y-m-d H:i:s'),'res'=>$result], true)."\n", FILE_APPEND);

// Open both input files read-only; abort the script if either cannot be opened.
$file = fopen("mission2.txt", "r") or exit("Unable to open file!");
$fileCity = fopen("city2.txt", "r") or exit("Unable to open file!");

// Start from empty lists so nothing left over from earlier code leaks in.
$arr = [];
$city = [];
$term = [];

// Read each file line by line until EOF.
// Looping on the return value of fgets() instead of feof() avoids storing the
// final `false` that fgets() returns once the end of the file is reached.
while (($line = fgets($file)) !== false) {
    //echo $line . "<br>";
    $arr[] = $line;
}

while (($line = fgets($fileCity)) !== false) {
    $city[] = $line;
}

//print_r($arr);

// Keep only the first space-separated field of every line.
foreach ($arr as $a) {
    list($term[]) = explode(" ", $a);
}

// NOTE(review): the markup inside these echo strings was stripped when the
// article was extracted; '<br>' is assumed here - confirm against the original.
echo '<br>';
print_r($term);
echo '<br>';

// Without a callback, array_filter() drops every entry loosely equal to false.
$city = array_filter($city);
print_r($city);

/**
 * Filter callback for array_filter(): tells whether an element should be kept.
 *
 * A bare "\r\n" has to be tested for explicitly to get rid of Windows-style
 * blank lines; the default "loosely equal to FALSE" test is kept as well,
 * otherwise the result may still contain unwanted entries.
 *
 * @param mixed $gg Element under test.
 * @return int 0 when the element must be dropped, 1 when it is kept.
 */
function gl($gg)
{
    $shouldDrop = ($gg == "\r\n" || $gg == FALSE);

    return $shouldDrop ? 0 : 1;
}

// Split every city line on tabs and drop the unwanted fields.
$cityAlone = [];
foreach ($city as $c) {
    // Without its second (callback) argument array_filter() only removes values
    // loosely equal to FALSE; gl() additionally removes a bare "\r\n" field.
    $cityAlone[] = array_filter(explode("\t", $c), "gl");
}

// NOTE(review): the markup inside the echo strings was stripped when the article
// was extracted (leaving unterminated literals); '<br>' and '<pre>' are assumed
// here - confirm against the original.
echo '<br>';
echo '<pre>';
print_r($cityAlone);
echo '</pre>';
echo '<br>';

// Only dump the first row when at least one city line was read.
if (isset($cityAlone[0])) {
    print_r(array_filter($cityAlone[0]));
}

// Pair the n-th term with every field of the n-th city line.
foreach ($term as $key => $te) {
    // Skip terms that have no city line at the same position.
    if (!isset($cityAlone[$key])) {
        continue;
    }

    foreach (array_filter($cityAlone[$key]) as $ci) {
        echo $ci . $te . '<br>';
    }
}

fclose($fileCity);
fclose($file);

// Raise the memory limit for this script.
// NOTE(review): presumably needed because all generated combinations are kept
// in memory before being written out - confirm.
ini_set('memory_limit', '512M');

// Open the three input files read-only; abort if any of them cannot be opened.
$fileLeimu = fopen("leimu.txt", "r") or exit("Unable to open file!");
$fileCity = fopen("city.txt", "r") or exit("Unable to open file!");
$fileZaci1 = fopen("zaci1.txt", "r") or exit("Unable to open file!");

// Start from empty lists so that values left in these variables by earlier
// code (e.g. a previous $city) are not mixed into the result.
$leimu = [];
$city = [];
$zaci1 = [];

// Read each file line by line. Looping on the return value of fgets() instead
// of feof() avoids appending the `false` that fgets() returns at end of file.
while (($line = fgets($fileLeimu)) !== false) {
    $leimu[] = $line;
}

while (($line = fgets($fileCity)) !== false) {
    $city[] = $line;
}

while (($line = fgets($fileZaci1)) !== false) {
    $zaci1[] = $line;
}

// gl() is also declared by an earlier snippet in this file; declaring it a
// second time in the same request is a fatal "Cannot redeclare" error, so it
// is only defined here when it does not exist yet.
if (!function_exists('gl')) {
    /**
     * Filter callback for array_filter(): tells whether an element is kept.
     *
     * "\r\n" is compared explicitly so that Windows-style blank lines are
     * removed; values loosely equal to FALSE (the default array_filter()
     * criterion) are removed too, otherwise the result may not be clean.
     *
     * @param mixed $gg Element under test.
     * @return int 0 to drop the element, 1 to keep it.
     */
    function gl($gg)
    {
        if ($gg == "\r\n" || $gg == FALSE) {
            return 0;
        }

        return 1;
    }
}

// Drop blank "\r\n" lines and entries loosely equal to FALSE.
$leimu = array_filter($leimu, "gl");
$city = array_filter($city, "gl");
$zaci1 = array_filter($zaci1, "gl");

$zaci2 = ['上門','找'];

// Remove the line ending at the end of every string element.
// This has to happen BEFORE de-duplication: otherwise "X\r\n" and a final
// "X" without a line break count as two different values.
$leimu = str_replace(array("\r\n", "\r", "\n"), "", $leimu);
$city = str_replace(array("\r\n", "\r", "\n"), "", $city);
$zaci1 = str_replace(array("\r\n", "\r", "\n"), "", $zaci1);

// De-duplicate (only $leimu and $city, as in the original script).
$leimu = array_unique($leimu);
$city = array_unique($city);

// Debug output, disabled.
// NOTE(review): the markup of the two echo lines was stripped when the article
// was extracted; '<pre>' / '</pre>' is assumed.
// echo '<pre>';
// print_r($zaci2);
// print_r($leimu);
// print_r($city);
// print_r($zaci1);
// echo '</pre>';

// Collect every generated phrase; starting from an empty array keeps the later
// implode() valid even when one of the input lists is empty.
$combRes = [];

// $city . $leimu . $zaci1
foreach ($city as $c) {
    foreach ($leimu as $l) {
        foreach ($zaci1 as $zOne) {
            $combRes[] = $c . $l . $zOne;
        }
    }
}

// $city . $zaci2 . $leimu
foreach ($city as $c) {
    foreach ($leimu as $l) {
        foreach ($zaci2 as $zTwo) {
            $combRes[] = $c . $zTwo . $l;
        }
    }
}

// $city . $leimu
foreach ($city as $c) {
    foreach ($leimu as $l) {
        $combRes[] = $c . $l;
    }
}

// $leimu . $zaci1
foreach ($leimu as $l) {
    foreach ($zaci1 as $zOne) {
        $combRes[] = $l . $zOne;
    }
}

// $zaci2 . $leimu
foreach ($zaci2 as $zTwo) {
    foreach ($leimu as $l) {
        $combRes[] = $zTwo . $l;
    }
}

// $leimu on its own
foreach ($leimu as $l) {
    $combRes[] = $l;
}

// Join the entries with CRLF so that each one ends up on its own line when the
// result is written to a .txt file.
$combRes = implode("\r\n", $combRes);

file_put_contents("456.txt", $combRes);

// echo gettype($combRes);

// NOTE(review): the markup inside the echo strings was stripped when the article
// was extracted (the closing one was left unterminated); '<pre>' / '</pre>' is
// assumed here - confirm against the original.
echo '<pre>';
print_r($combRes);
echo '</pre>';

fclose($fileLeimu);
fclose($fileCity);
fclose($fileZaci1);

獲取頁面某個標籤中的內容：如果能用下載的 simplehtmldom 類打開頁面，用 DOM 操作比較方便；若打不開，則用 file_get_contents 或 curl（網上說 curl 效率更高）讀取全文內容，然後用正則表達式匹配。

require_once "../classes/simplehtmldom_1_5/simple_html_dom.php";

$mainHtml = 'http://***/index.xml';
//$mainHtml ='http://***/20161222-0.xml'; // why can this one not be opened with file_get_html()?

// Build a DOM from the index document.
$html = file_get_html($mainHtml);
// NOTE(review): file_get_html() is assumed to return false when the document
// cannot be loaded - confirm against the simple_html_dom version in use.
if ($html === false) {
    exit("Unable to open file!");
}

// Collect the text of every <loc> element found in the index.
$locTextRecord = [];
foreach ($html->find('loc') as $loc) {
    $locTextRecord[] = $loc->plaintext;
}

// $htmltest = file_get_contents($locTextRecord[4]);

// NOTE(review): the tag name inside this pattern was stripped when the article
// was extracted (what remained was not a valid regex); <loc> is assumed here -
// replace it with the tag that should be captured.
$reg = '/<loc>(.*?)<\/loc>/is';

// Fetch every collected URL and gather the first capture group of each match.
$record = [];
foreach ($locTextRecord as $everyLoc) {
    $htmltest = file_get_contents($everyLoc);

    // A failed download is reported the same way as a page without matches.
    if ($htmltest !== false && preg_match_all($reg, $htmltest, $matches)) {
        foreach ($matches[1] as $a) {
            $record[] = $a;
        }
    } else {
        // NOTE(review): '<br>' is assumed for the stripped markup.
        echo "匹配失敗!<br>";
    }
}

$record = array_unique($record);

foreach ($record as $r) {
    // var_dump() also shows the variable type; handy for debugging and for
    // copying the values straight from the web page.
    var_dump($r);
}

$html->clear();

一個主xml下含很多子標簽，在所有xml中，查找某個字符串：

$mainHtml = file_get_contents('http://***/tp_index.xml');

// Capture the content of every <loc> element.
// NOTE(review): the tag name was stripped from the pattern when the article was
// extracted; <loc> is restored based on the "loc" failure message below.
$reg = '/<loc>(.*?)<\/loc>/is';

// Start from an empty list so entries collected by earlier code are not reused.
$locTextRecord = [];
if ($mainHtml !== false && preg_match_all($reg, $mainHtml, $matches)) {
    // $matches[1] holds the text of the first capture group of every match.
    $locTextRecord = $matches[1];
} else {
    // NOTE(review): '<br>' is assumed for the stripped markup.
    echo "loc匹配失敗!<br>";
}

//var_dump($locTextRecord);

//$aim = 'https://baidu.com?srcid%3D1000%26id%3D10034_5467_%E5%8C%97%E4%BA%AC';
$aim = '搬家';

// Remember every sub-document that contains the search string.
$record = [];
foreach ($locTextRecord as $everyLoc) {
    $htmlTest = file_get_contents($everyLoc);

    // strpos() returns 0 for a match at the very start of the text, so the
    // result must be compared with false explicitly instead of being used as a
    // boolean.
    if ($htmlTest !== false && strpos($htmlTest, $aim) !== false) {
        $record[] = $everyLoc;
    } else {
        echo "aim匹配失敗!<br>";
    }
}

//$record = array_unique($record);

foreach ($record as $r) {
    // var_dump() also shows the variable type; handy for debugging and for
    // copying the values straight from the web page.
    var_dump($r);
}

注意：

preg_match()的第三個參數：如果提供了參數matches，它將被填充為搜索結果。 $matches[0]將包含完整模式匹配到的文本， $matches[1] 將包含第一個捕獲子組匹配到的文本，以此類推。

preg_match_all()的第三個參數：多維數組，作為輸出參數輸出所有匹配結果, 數組排序通過flags指定。

總結

以上是生活随笔為你收集整理的php作为文本进行处理,PHP处理文本和爬虫技巧的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇：去眼科医院定制一副散光隐形眼镜多少钱？
下一篇： 2022-2028年中国动力电池行业深度