PHP simple_html_dom 使用记录
时间:2024-4-30 11:04 作者:杨佳乐 分类: PHP
按官方文档使用 file_get_html 时出现乱码的处理方法
先使用 file_get_contents 获取页面内容并转换编码,再把转换后的内容交给 str_get_html 解析:
// mb_convert_encoding() returns the converted string rather than modifying its
// argument, so the result must be assigned back for the conversion to take effect.
$content = mb_convert_encoding($content, 'UTF-8', "auto");
// Directory in which downloaded article images are saved.
$path = IA_ROOT . "/images/";

// Fetch the HTML content of the article.
$url = "要抓取的微信文章网址";
$content = file_get_contents($url);
if ($content === false) {
    // Stop here: otherwise `false` would be handed to the parser below.
    throw new RuntimeException("Failed to fetch the page: " . $url);
}

// Convert to UTF-8 before parsing so the parsed text is not garbled.
// Content that is already valid UTF-8 is left untouched.
if (!mb_check_encoding($content, 'UTF-8')) {
    $converted = mb_convert_encoding($content, 'UTF-8', "auto");
    // Keep the original bytes if the source encoding could not be detected.
    if (is_string($converted)) {
        $content = $converted;
    }
}

// Build a page file name from the last path segment of the URL.
$url_info = explode('/', trim($url, '/'));
$url_info = array_reverse($url_info);
$page_name = $url_info[0] . '.html';

// Load the simple_html_dom library and parse the fetched markup.
require "simple_html_dom.php";
$html = str_get_html($content);
if (!$html) {
    // NOTE(review): str_get_html appears to return false for empty or oversized
    // input; without this guard the later $html->find() calls would be fatal.
    throw new RuntimeException("Failed to parse the page: " . $url);
}
// Download every image referenced through a `data-src` attribute and point the
// tag's `src` at the saved copy.
// NOTE(review): files are written under $path, but `src` is rewritten to
// "/attachment/wechat_article_image/" -- confirm that both refer to the same
// location on the web server.
$images = $html->find('img');
foreach ($images as $image) {
    $image_url = $image->getAttribute('data-src');
    if (!$image_url) {
        // No deferred source on this tag: leave it untouched.
        continue;
    }

    // The file name is taken from the second-to-last path segment of the URL.
    $segments = array_reverse(explode('/', trim($image_url, '/')));
    if (!isset($segments[1]) || $segments[1] === '') {
        // URL too short to derive a file name from.
        continue;
    }
    $file_name = $segments[1] . '.jpg';

    $image_file = file_get_contents($image_url);
    if ($image_file === false) {
        // Download failed: do not write an empty file or rewrite the tag.
        continue;
    }

    if (file_put_contents($path . $file_name, $image_file) === false) {
        // Saving failed: keep the original attributes so the tag is not
        // pointed at a file that does not exist.
        continue;
    }

    // $image is the DOM node itself, so it can be updated directly instead of
    // searching the document again by index.
    $image->setAttribute('src', "/attachment/wechat_article_image/" . $file_name);
}
// Fetch the real URL of each iframe from its `data-src` attribute and copy it
// into `src`.
$iframes = $html->find('iframe');
foreach ($iframes as $iframe) {
    $iframe_url = $iframe->getAttribute('data-src');
    if (!$iframe_url) {
        // No `data-src` present: keep whatever `src` the iframe already has
        // instead of overwriting it with an empty value.
        continue;
    }

    // $iframe is the DOM node itself; no need to look it up again by index.
    $iframe->src = $iframe_url;
    // TODO need to calculate the width and height of the video
    // (presumably the `data-w` attribute can be used for this -- verify).
}
// Select the div whose id is "js_content" and print it.
$res = $html->find('div[id=js_content]', 0);
if ($res) {
    echo $res;
    die;
}

// The div was not found: fall back to printing the whole modified document.
// (Previously this part was unreachable because of the unconditional die above.)
$doc = $html;
echo $doc;
die;