«

PHP simple_html_dom 使用记录

时间:2024-4-30 11:04     作者:杨佳乐     分类: PHP


 按官方文档使用 file_get_html 抓取页面时出现乱码的处理方法:
              先使用 file_get_contents 获取页面内容并转换编码,再把转换后的内容交给 str_get_html 解析:
              $content = mb_convert_encoding($content, 'UTF-8', "auto");

        // Scrape a WeChat article: download its lazy-loaded images to local
        // disk, point each <img>/<iframe> at a usable src, then print the
        // article body (the div with id "js_content").

        // Directory on disk where the downloaded images are saved.
        $path = IA_ROOT . "/images/";

        // URL of the WeChat article to scrape (placeholder; replace it with a real URL).
        $url = "要抓取的微信文章网址";

        // Load the simple_html_dom library (download it separately and keep it
        // on the include path).
        require "simple_html_dom.php";

        // Fetch the raw HTML. file_get_contents() returns false on failure, so
        // stop here instead of handing false to the parser.
        $content = file_get_contents($url);
        if ($content === false) {
            die('Failed to fetch the article: ' . $url);
        }

        // str_get_html() returns false when the markup is empty or too large to
        // parse; calling ->find() on that would be a fatal error.
        $html = str_get_html($content);
        if (!$html) {
            die('Failed to parse the article HTML');
        }

        // Resolve the real image URLs (kept in "data-src") and download them.
        foreach ($html->find('img') as $image) {
            $image_url = $image->getAttribute('data-src');
            if (!$image_url) {
                continue;
            }

            // Skip images that cannot be downloaded rather than writing an
            // empty file and linking to it.
            $image_file = file_get_contents($image_url);
            if ($image_file === false) {
                continue;
            }

            // The file name is taken from the second-to-last path segment of
            // the image URL; the last segment is ignored.
            $url_info = array_reverse(explode('/', trim($image_url, '/')));
            if (!isset($url_info[1]) || $url_info[1] === '') {
                continue;
            }
            // NOTE(review): the ".jpg" extension is hard-coded regardless of the
            // actual image format - confirm that this is acceptable.
            $file_name = $url_info[1] . '.jpg';

            if (file_put_contents($path . $file_name, $image_file) === false) {
                continue;
            }

            // Point the src attribute at the locally stored copy.
            // NOTE(review): the file is written below IA_ROOT/images/ while src
            // refers to /attachment/wechat_article_image/ - presumably the web
            // server maps one onto the other; verify against the deployment.
            $image->setAttribute('src', "/attachment/wechat_article_image/" . $file_name);
        }

        // Copy the real URL of each iframe from "data-src" into "src".
        foreach ($html->find('iframe') as $iframe) {
            $iframe_url = $iframe->getAttribute('data-src');
            // Leave the iframe untouched when it has no data-src; assigning
            // false would discard an src that is already present.
            if ($iframe_url) {
                $iframe->src = $iframe_url;
            }
            // TODO need to calculate the width and height of the video
        }

        // Output the article body: the div whose id is "js_content".
        $res = $html->find('div[id=js_content]', 0);
        echo $res;
        // To output the whole modified document instead, use: echo $html;
        die;