日韩性视频-久久久蜜桃-www中文字幕-在线中文字幕av-亚洲欧美一区二区三区四区-撸久久-香蕉视频一区-久久无码精品丰满人妻-国产高潮av-激情福利社-日韩av网址大全-国产精品久久999-日本五十路在线-性欧美在线-久久99精品波多结衣一区-男女午夜免费视频-黑人极品ⅴideos精品欧美棵-人人妻人人澡人人爽精品欧美一区-日韩一区在线看-欧美a级在线免费观看

歡迎訪問 生活随笔!

生活随笔

當前位置: 首頁 > 编程语言 > php >内容正文

php

php queryList函数,QueryList/QueryList.php at master · baijunyao/QueryList · GitHub

發布時間:2025/4/5 php 21 豆豆
生活随笔 收集整理的這篇文章主要介紹了 php queryList函数,QueryList/QueryList.php at master · baijunyao/QueryList · GitHub 小編覺得挺不錯的,現在分享給大家,幫大家做個參考.

namespace QL;

use phpQuery,Exception,ReflectionClass;

use Monolog\Logger;

use Monolog\Handler\StreamHandler;

/**
 * QueryList
 *
 * A general-purpose list scraper built on top of phpQuery.
 *
 * @author Jaeger
 * @email 734708094@qq.com
 * @link http://git.oschina.net/jae/QueryList
 * @version 3.1.2
 *
 * @example
 *
 * // Fetch the article titles listed in CSDN's mobile development section
 * $hj = QueryList::Query('http://mobile.csdn.net/', array("title" => array('.unit h1', 'text')));
 * print_r($hj->data);
 *
 * // Callback 1
 * function callfun1($content, $key)
 * {
 *     return 'callback 1: ' . $key . '-' . $content;
 * }
 *
 * class HJ {
 *     // Callback 2
 *     static public function callfun2($content, $key)
 *     {
 *         return 'callback 2: ' . $key . '-' . $content;
 *     }
 * }
 *
 * // Fetch the title and body of a CSDN article page
 * $url = 'http://www.csdn.net/article/2014-06-05/2820091-build-or-buy-a-mobile-game-backend';
 * $rules = array(
 *     // plain-text title, passed through callback 1
 *     'title' => array('h1', 'text', '', 'callfun1'),
 *     // plain-text summary that keeps <strong> tags and drops <input> elements
 *     'summary' => array('.summary', 'text', '-input strong'),
 *     // HTML body with <div>/<a> tags stripped and ".copyright" elements removed
 *     'content' => array('.news_content', 'html', 'div a -.copyright'),
 *     // callback 2 acts as the global callback
 *     'callback' => array('HJ', 'callfun2')
 * );
 * $rang = '.left';
 * $hj = QueryList::Query($url, $rules, $rang);
 * print_r($hj->data);
 *
 * // Re-query the same page for the "related hot articles" titles and links
 * $hj->setQuery(array('title' => array('', 'text'), 'url' => array('a', 'href')), '#con_two_2 li');
 *
 * // Output the data
 * print_r($hj->getData());
 */
class QueryList
{
    /** @var array Rows collected by the most recent query. */
    public $data;

    /** @var string|false Source HTML of the most recent query (after optional head removal). */
    public $html;

    /** @var mixed The $page argument of the most recent query (URL or HTML). */
    private $page;

    /** @var string The document as re-serialized by phpQuery. */
    private $pqHtml;

    /** @var string|false Target encoding for collected data; false leaves data untouched. */
    private $outputEncoding = false;

    /** @var string|false Explicit source encoding; false means auto-detect. */
    private $inputEncoding = false;

    /** @var string|false Encoding actually assumed for the source HTML. */
    private $htmlEncoding;

    /** @var array Selector rules of the current query. */
    private $regArr = array();

    /** @var string Block (range) selector of the current query. */
    private $regRange = '';

    /** @var object|null Logger used by _log(); set through setLog(). */
    public static $logger = null;

    /** @var array|null Instances cached by getInstance(), keyed by a hash of its arguments. */
    public static $instances;

    public function __construct()
    {
    }

    /**
     * Static entry point: runs a query on the shared QueryList instance.
     *
     * Because the shared instance is reused, an output/input encoding passed
     * once stays in effect for later calls that pass a falsy value.
     *
     * @param string $page URL to fetch (http or https), or HTML source code
     * @param array $rules Selector rules, in the form
     *        array("name" => array("selector", "type"[, "tag filter list"][, "callback"]), ...[, "callback" => "global callback"])
     *        - selector: any jQuery-style selector
     *        - type: "text", "html", or the name of an HTML attribute
     *        - tag filter list: optional, space separated. An entry prefixed with "-" is an element
     *          selector whose matches are removed together with their content. Other entries are tag
     *          names: kept when type is "text", stripped (tag only, content kept) when type is "html"
     *        - callback / global callback: optional callable taking (selected content, rule key);
     *          a per-rule callback takes precedence over the global one
     * @param string $range Block selector: first select blocks, then apply every rule inside each block
     * @param string $outputEncoding Encoding to convert results to (UTF-8, GB2312, ...); falsy keeps the current setting
     * @param string $inputEncoding Encoding of the page (UTF-8, GB2312, ...); falsy keeps the current setting (auto-detect by default)
     * @param bool|false $removeHead Whether to remove the page's head section before parsing
     * @return mixed
     */
    public static function Query($page, array $rules, $range = '', $outputEncoding = null, $inputEncoding = null, $removeHead = false)
    {
        return self::getInstance()->_query($page, $rules, $range, $outputEncoding, $inputEncoding, $removeHead);
    }

    /**
     * Runs a QueryList extension.
     *
     * @param string $class Extension class name inside the QL\Ext namespace
     * @param array $args Arguments handed to the extension's run() method
     * @return mixed
     * @throws Exception
     */
    public static function run($class, $args = array())
    {
        $extension = self::getInstance("QL\\Ext\\{$class}");

        return $extension->run($args);
    }

    /**
     * Configures logging.
     *
     * @param mixed $handler A string (wrapped in a StreamHandler at INFO level) or a handler object
     * @throws Exception When the Logger class is not available
     */
    public static function setLog($handler)
    {
        if (!class_exists(Logger::class)) {
            throw new Exception("You need to install the package [monolog/monolog]");
        }

        if (is_string($handler)) {
            $handler = new StreamHandler($handler, Logger::INFO);
        }

        self::$logger = new Logger('QueryList');
        self::$logger->pushHandler($handler);
    }

    /**
     * Returns a cached instance of any class.
     *
     * The first argument is the class name (this class when omitted); the
     * remaining arguments are passed to the constructor. One instance is
     * cached per distinct argument list.
     *
     * @return mixed
     * @throws Exception When the class does not exist
     */
    public static function getInstance()
    {
        $args = func_get_args();
        if (!count($args)) {
            // __CLASS__ is this class's fully qualified name, so the cache key
            // matches what a hard-coded name would produce.
            $args = array(__CLASS__);
        }

        $key = md5(serialize($args));
        $className = array_shift($args);

        if (!class_exists($className)) {
            throw new Exception("no class {$className}");
        }

        if (!isset(self::$instances[$key])) {
            $rc = new ReflectionClass($className);
            self::$instances[$key] = $rc->newInstanceArgs($args);
        }

        return self::$instances[$key];
    }

    /**
     * Returns the source of the target page (mainly for debugging).
     *
     * @param bool|true $rel true for the document as serialized by phpQuery, false for the raw HTML
     * @return string
     */
    public function getHtml($rel = true)
    {
        return $rel ? $this->pqHtml : $this->html;
    }

    /**
     * Returns the collected data.
     *
     * @param callback $callback Optional callable applied to every row
     * @return array
     */
    public function getData($callback = null)
    {
        if (is_callable($callback)) {
            return array_map($callback, $this->data);
        }

        return $this->data;
    }

    /**
     * Runs a new set of rules against the HTML of the previous query.
     *
     * @param array $rules
     * @param string $range
     * @param string $outputEncoding
     * @param string $inputEncoding
     * @param bool|false $removeHead
     * @return QueryList
     */
    public function setQuery(array $rules, $range = '', $outputEncoding = null, $inputEncoding = null, $removeHead = false)
    {
        return $this->_query($this->html, $rules, $range, $outputEncoding, $inputEncoding, $removeHead);
    }

    /**
     * Loads the page, stores the query settings and collects the data.
     *
     * @param string $page
     * @param array $rules
     * @param string $range
     * @param string $outputEncoding
     * @param string $inputEncoding
     * @param bool $removeHead
     * @return QueryList
     */
    private function _query($page, array $rules, $range, $outputEncoding, $inputEncoding, $removeHead)
    {
        $this->data = array();
        $this->page = $page;
        $this->html = $this->_isURL($this->page) ? $this->_request($this->page) : $this->page;

        // Falsy arguments keep whatever encoding an earlier query configured.
        if ($outputEncoding) {
            $this->outputEncoding = $outputEncoding;
        }
        if ($inputEncoding) {
            $this->inputEncoding = $inputEncoding;
        }
        if ($removeHead) {
            $this->html = $this->_removeHead($this->html);
        }

        $this->pqHtml = '';

        if (empty($this->html)) {
            $this->_log('The received content is empty!', 'error');
            trigger_error('The received content is empty!', E_USER_NOTICE);
        }

        // Determine the encoding of the source.
        $this->htmlEncoding = $this->inputEncoding ? $this->inputEncoding : $this->_getEncode($this->html);

        $this->regArr = $rules;
        $this->regRange = $range;

        $this->_getList();
        $this->_log();

        return $this;
    }

    /**
     * Applies the stored rules to the stored HTML and fills $this->data.
     */
    private function _getList()
    {
        if ($this->inputEncoding) {
            phpQuery::$defaultCharset = $this->inputEncoding;
        }

        $document = phpQuery::newDocumentHTML($this->html);
        $this->pqHtml = $document->htmlOuter();

        $globalCallback = isset($this->regArr['callback']) ? $this->regArr['callback'] : null;

        if (!empty($this->regRange)) {
            // Block mode: one row per block, every rule is evaluated inside the block.
            $i = 0;
            foreach (pq($document)->find($this->regRange) as $item) {
                foreach ($this->regArr as $key => $rule) {
                    // Strict comparison: a rule stored at integer key 0 must not be
                    // mistaken for the global callback entry.
                    if ($key === 'callback') {
                        continue;
                    }
                    $target = pq($item)->find($rule[0]);
                    $this->data[$i][$key] = $this->_extractValue($target, $rule, $key, $globalCallback);
                }
                $i++;
            }
        } else {
            // Flat mode: the n-th match of every rule lands in row n.
            foreach ($this->regArr as $key => $rule) {
                if ($key === 'callback') {
                    continue;
                }
                // NOTE(review): the document is re-parsed for every rule, as before;
                // a single parse is probably enough but has not been verified.
                $document = phpQuery::newDocumentHTML($this->html);
                $i = 0;
                foreach (pq($document)->find($rule[0]) as $item) {
                    $this->data[$i][$key] = $this->_extractValue($item, $rule, $key, $globalCallback);
                    $i++;
                }
            }
        }

        if ($this->outputEncoding) {
            // Convert the collected data to the requested encoding.
            $this->data = $this->_arrayConvertEncoding($this->data, $this->outputEncoding, $this->htmlEncoding);
        }

        phpQuery::$documents = array();
    }

    /**
     * Reads one value from a matched node according to a rule and runs the
     * applicable callback on it.
     *
     * @param mixed $node Matched DOM node or phpQuery object
     * @param array $rule array(selector, type[, tag filter list][, callback])
     * @param mixed $key Key of the rule, passed to the callback
     * @param mixed $globalCallback Global callback, or null when there is none
     * @return mixed
     */
    private function _extractValue($node, $rule, $key, $globalCallback)
    {
        $tags = isset($rule[2]) ? $rule[2] : '';

        switch ($rule[1]) {
            case 'text':
                $value = $this->_allowTags(pq($node)->html(), $tags);
                break;
            case 'html':
                $value = $this->_stripTags(pq($node)->html(), $tags);
                break;
            default:
                $value = pq($node)->attr($rule[1]);
                break;
        }

        // A per-rule callback takes precedence over the global one.
        if (isset($rule[3])) {
            return call_user_func($rule[3], $value, $key);
        }
        if ($globalCallback !== null) {
            return call_user_func($globalCallback, $value, $key);
        }

        return $value;
    }

    /**
     * Fetches a URL.
     *
     * @param string $url
     * @return string|false Response body, or false when the request fails
     */
    private function _request($url)
    {
        if (function_exists('curl_init')) {
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $url);
            // NOTE(review): certificate and host verification are switched off, so
            // HTTPS responses are not authenticated. Left as-is because callers may
            // rely on it for sites with broken certificates.
            curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
            curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
            curl_setopt($ch, CURLOPT_AUTOREFERER, true);
            curl_setopt($ch, CURLOPT_REFERER, $url);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
            curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36');
            $result = curl_exec($ch);
            curl_close($ch);

            return $result;
        }

        // Without cURL, fall back to the stream wrapper and send the same Referer.
        $opts = array(
            'http' => array(
                'header' => "Referer:{$url}"
            )
        );

        return file_get_contents($url, false, stream_context_create($opts));
    }

    /**
     * Removes the head section of a page.
     *
     * @param string $html
     * @return mixed The HTML without its <head>...</head> element; unchanged when there is none
     */
    private function _removeHead($html)
    {
        $stripped = preg_replace('/<head\b[^>]*>.*?<\/head>/is', '', $html);

        // preg_replace() yields null on a PCRE error; keep the input in that case.
        return $stripped === null ? $html : $stripped;
    }

    /**
     * Detects the encoding of a string.
     *
     * @param string $string
     * @return string|false
     */
    private function _getEncode($string)
    {
        return mb_detect_encoding($string, array('ASCII', 'GB2312', 'GBK', 'UTF-8'));
    }

    /**
     * Converts every string key and string value of a (nested) array to
     * another encoding.
     *
     * Each string is converted on its own, so re-encoded content can never be
     * interpreted as PHP code. Non-string scalars, null and objects are
     * returned untouched.
     *
     * @param array $arr
     * @param string $toEncoding
     * @param string $fromEncoding
     * @return array
     */
    private function _arrayConvertEncoding($arr, $toEncoding, $fromEncoding)
    {
        $converted = array();

        foreach ($arr as $key => $value) {
            if (is_string($key)) {
                $key = $this->_convertValue($key, $toEncoding, $fromEncoding);
            }
            $converted[$key] = $this->_convertValue($value, $toEncoding, $fromEncoding);
        }

        return $converted;
    }

    /**
     * Converts a single value: arrays recursively, strings through iconv,
     * anything else unchanged.
     *
     * @param mixed $value
     * @param string $toEncoding
     * @param string $fromEncoding
     * @return mixed
     */
    private function _convertValue($value, $toEncoding, $fromEncoding)
    {
        if (is_array($value)) {
            return $this->_arrayConvertEncoding($value, $toEncoding, $fromEncoding);
        }
        if (!is_string($value) || $value === '') {
            return $value;
        }

        $result = iconv($fromEncoding, $toEncoding . '//IGNORE', $value);

        // Keep the original text when the conversion itself fails.
        return $result === false ? $value : $result;
    }

    /**
     * Simple check whether the argument looks like a URL.
     *
     * @param string $str
     * @return boolean
     */
    private function _isURL($str)
    {
        return is_string($str) && preg_match('#^https?://.+#', $str) === 1;
    }

    /**
     * Strips specific HTML tags (the tags only, their content stays).
     *
     * @param string $html
     * @param string $tags_str Tag names separated by whitespace; entries prefixed with "-" are selectors to remove entirely
     * @return string
     */
    private function _stripTags($html, $tags_str)
    {
        $tagsArr = $this->_tag($tags_str);
        $html = $this->_removeTags($html, $tagsArr[1]);

        $patterns = array();
        foreach ($tagsArr[0] as $tag) {
            // Opening and closing tag of exactly this name; the lookahead stops
            // "a" from matching <article> or <abbr>.
            $patterns[] = '/<\/?' . preg_quote($tag, '/') . '(?=[\s\/>])[^>]*>/i';
        }

        return preg_replace($patterns, '', trim($html));
    }

    /**
     * Strips all HTML tags except specific ones.
     *
     * @param string $html
     * @param string $tags_str Tag names separated by whitespace; entries prefixed with "-" are selectors to remove entirely
     * @return string
     */
    private function _allowTags($html, $tags_str)
    {
        $tagsArr = $this->_tag($tags_str);
        $html = $this->_removeTags($html, $tagsArr[1]);

        $allow = '';
        foreach ($tagsArr[0] as $tag) {
            $allow .= '<' . $tag . '>';
        }

        return strip_tags(trim($html), $allow);
    }

    /**
     * Splits a tag filter list into plain tag names and removal selectors.
     *
     * @param string $tags_str Whitespace separated entries
     * @return array array(0 => plain entries, 1 => entries that had a leading "-", without the "-")
     */
    private function _tag($tags_str)
    {
        $tags = array(array(), array());

        foreach (preg_split("/\s+/", $tags_str, -1, PREG_SPLIT_NO_EMPTY) as $tag) {
            // Only a leading "-" marks a removal; a dash inside a name
            // (e.g. "my-tag") belongs to the name.
            if (preg_match('/^-(.+)/', $tag, $match)) {
                $tags[1][] = $match[1];
            } else {
                $tags[0][] = $tag;
            }
        }

        return $tags;
    }

    /**
     * Removes the elements matched by the given selectors, including their content.
     *
     * @param string $html
     * @param array $tags Selectors
     * @return string
     */
    private function _removeTags($html, $tags)
    {
        if (!count($tags)) {
            return $html;
        }

        phpQuery::$defaultCharset = $this->inputEncoding ? $this->inputEncoding : $this->htmlEncoding;

        $doc = phpQuery::newDocumentHTML($html);
        pq($doc)->find(implode(',', $tags))->remove();
        $html = pq($doc)->htmlOuter();
        $doc->unloadDocument();

        return $html;
    }

    /**
     * Writes a log entry when a logger is configured.
     *
     * @param string $message Empty picks a success/failure message from the row count
     * @param string $level Logger method name; empty picks "info" or "warning" from the row count
     */
    private function _log($message = '', $level = '')
    {
        if (is_null(self::$logger)) {
            return;
        }

        $url = $this->_isURL($this->page) ? $this->page : '[html]';
        $count = count($this->data);

        if (empty($level)) {
            $level = $count ? 'info' : 'warning';
        }
        if (empty($message)) {
            $message = $count ? 'Get data successfully' : 'Get data failed';
        }

        self::$logger->$level($message, array(
            'page' => $url,
            'count' => $count
        ));
    }
}

/*

class Autoload

{

public static function load($className)

{

$files = array(

sprintf('%s/extensions/%s.php',__DIR__,$className),

sprintf('%s/extensions/vendors/%s.php',__DIR__,$className)

);

foreach ($files as $file) {

if(is_file($file)){

require $file;

return true;

}

}

return false;

}

}

spl_autoload_register(array('Autoload','load'));

*/

總結

以上是生活随笔為你收集整理的php queryList函数,QueryList/QueryList.php at master · baijunyao/QueryList · GitHub的全部內容,希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。