Welcome 微信登录

首页 / 脚本样式 / JavaScript / nodejs通过phantomjs实现下载网页

功能其实很简单:通过 phantomjs.exe 采集 url 加载的资源,再以子进程的方式启动 nodejs 下载所有的资源;对于 css 资源,还会匹配 css 内容,下载其中 url 引用的资源
当然功能还是很简单的,在响应式设计和异步加载的情况下,还是有很多资源没有能够下载,需要根据实际情况处理下
 首先当然是下载 nodejs 和 phantomjs
下面是 phantomjs.exe 执行的 down.js

// down.js — executed by phantomjs.exe.
// Opens the page given on the command line, records the URL of every HTTP(S)
// resource the page requests, then spawns Node.js (downHtml.js) as a child
// process and passes it the comma-joined URL list to download.
//
// NOTE: `var` is used on purpose; this file runs inside PhantomJS, not Node.
var page = require("webpage").create();
var system = require("system");
var spawn = require("child_process").spawn;

if (system.args.length === 1) {
    console.log("Usage: phantomjs down.js <some URL>");
    phantom.exit(1);
} else {
    var urls = [];
    page.address = system.args[1];

    page.onResourceReceived = function (res) {
        // Record each response once (on its "start" stage) and keep only
        // http/https URLs: data: URIs cannot be downloaded and usually
        // contain commas, which would corrupt the comma-joined argument below.
        if (res.stage === "start" && /^https?:/i.test(res.url)) {
            urls.push(res.url);
        }
    };

    page.open(page.address, function (status) {
        if (status !== "success") {
            console.log("FAIL to load the address");
            phantom.exit(1);
            return;
        }

        console.log("down resource " + urls.length + " urls.");

        // downHtml.js must sit in the current working directory.
        var child = spawn("node", ["--harmony", "downHtml.js", urls.join(",")]);

        // Forward the downloader's output to the PhantomJS console.
        child.stdout.on("data", function (data) {
            console.log(data);
        });
        child.stderr.on("data", function (data) {
            console.log(data);
        });

        // Quit PhantomJS once the downloader has finished.
        child.on("exit", function (code) {
            phantom.exit();
        });
    });
}
下面是对应的node运行的 downHtml.js
"use strict";
// downHtml.js — executed by Node.js (spawned from down.js).
// Takes a comma-separated list of resource URLs as its single argument and
// saves each one under <cwd>/<hostname>/<pathname>. For CSS resources it also
// scans the stylesheet text for url(...) references and downloads those too.
const fs = require("fs");
const http = require("http");
const https = require("https");
const path = require("path");
const r_url = require("url");

// Root folder that all downloaded files are written under.
const CWD = process.cwd();

// Directories already known to exist; lets makedir skip repeated fs checks.
const dirCache = {};

// Absolute URLs of CSS-referenced resources that were already requested.
const isDownMap = {};

/**
 * Ensure the directory `pathStr` exists, creating missing parents first.
 * Best-effort: `callback` is always invoked (with no arguments), even when
 * creation fails; the failure is logged and surfaces later on write.
 *
 * @param {string} pathStr - Directory path to create.
 * @param {Function} callback - Called once the attempt has finished.
 */
function makedir(pathStr, callback) {
  if (dirCache[pathStr] === 1) {
    callback();
    return;
  }
  fs.stat(pathStr, function (statErr) {
    if (!statErr) {
      // Already present: remember it (the original compared with == here,
      // so the cache was never filled).
      dirCache[pathStr] = 1;
      callback();
      return;
    }
    const parent = path.dirname(pathStr);
    if (parent === pathStr) {
      // Reached the top of the path without finding anything; stop recursing.
      callback();
      return;
    }
    makedir(parent, function () {
      fs.mkdir(pathStr, function (mkdirErr) {
        // EEXIST just means a concurrent download created it first.
        if (!mkdirErr || mkdirErr.code === "EEXIST") {
          dirCache[pathStr] = 1;
        } else {
          console.log("mkdir failed: " + pathStr + " (" + mkdirErr.message + ")");
        }
        callback();
      });
    });
  });
}

/**
 * Pick the client module for a URL's protocol.
 *
 * @param {string} targetUrl - URL to inspect.
 * @returns {object|null} `http`, `https`, or null for any other scheme
 *   (data:, about:, ...), which cannot be downloaded.
 */
function clientFor(targetUrl) {
  const protocol = r_url.parse(targetUrl).protocol;
  if (protocol === "http:") {
    return http;
  }
  if (protocol === "https:") {
    return https;
  }
  return null;
}

/**
 * Issue a GET request with the client matching the URL's protocol.
 * Request errors are logged instead of crashing the whole run.
 *
 * @param {string} targetUrl - http: or https: URL.
 * @param {Function} onResponse - Receives the response stream.
 */
function httpGet(targetUrl, onResponse) {
  const client = clientFor(targetUrl);
  if (!client) {
    return;
  }
  client.get(targetUrl, onResponse).on("error", function (err) {
    console.log("request failed: " + targetUrl + " (" + err.message + ")");
  });
}

/**
 * Map a parsed URL to its local file path under CWD.
 * A pathname that is empty or ends in "/" is stored as index.html inside
 * that folder, because a directory path cannot be opened as a file.
 *
 * @param {object} uo - Result of url.parse().
 * @returns {string} Local file path.
 */
function localPathFor(uo) {
  let pathname = uo.pathname || "/";
  if (pathname.charAt(pathname.length - 1) === "/") {
    pathname += "index.html";
  }
  return CWD + "/" + uo.hostname + pathname;
}

/**
 * Stream a response body into `filepath`, logging write failures.
 *
 * @param {object} res - Response stream.
 * @param {string} filepath - Destination file.
 */
function saveResponse(res, filepath) {
  const out = fs.createWriteStream(filepath);
  out.on("error", function (err) {
    console.log("write failed: " + filepath + " (" + err.message + ")");
    // Drain the response so the socket is released.
    res.resume();
  });
  res.pipe(out);
}

/**
 * Collect the targets of all url(...) references in a stylesheet.
 * Handles unquoted, single-quoted and double-quoted forms.
 *
 * @param {string} body - CSS source text.
 * @returns {string[]} Referenced URLs as written (possibly relative).
 */
function extractCssUrls(body) {
  // Fresh regex per call: a shared /g regex would carry lastIndex state.
  const pattern = /url\(\s*(['"]?)(.*?)\1\s*\)/g;
  const found = [];
  let m;
  while ((m = pattern.exec(body)) !== null) {
    if (m[2]) {
      found.push(m[2]);
    }
  }
  return found;
}

/**
 * Fetch the stylesheet at `URL` and download every resource it references
 * through url(...), each at most once per run.
 *
 * @param {string} URL - Absolute URL of the CSS file.
 */
function downImgFromCss(URL) {
  httpGet(URL, function (res) {
    let body = "";
    res.setEncoding("utf8");
    res.on("data", function (chunk) {
      body += chunk;
    });
    res.on("end", function () {
      extractCssUrls(body).forEach(function (ref) {
        const imgUrl = r_url.resolve(URL, ref);
        // Skip repeats and anything that is not downloadable (e.g. data: URIs).
        if (isDownMap[imgUrl] || !clientFor(imgUrl)) {
          return;
        }
        isDownMap[imgUrl] = 1;
        const filepath = localPathFor(r_url.parse(imgUrl));
        makedir(path.dirname(filepath), function () {
          httpGet(imgUrl, function (imgRes) {
            saveResponse(imgRes, filepath);
          });
        });
      });
    });
  });
}

/**
 * Entry point: download every URL listed in the first CLI argument.
 */
function main() {
  const arg = process.argv[2];
  if (!arg) {
    console.log("Usage: node downHtml.js <url1,url2,...>");
    process.exitCode = 1;
    return;
  }

  // Download the resources.
  arg.split(",").forEach(function (URL) {
    if (!clientFor(URL)) {
      return;
    }
    const filepath = localPathFor(r_url.parse(URL));
    makedir(path.dirname(filepath), function () {
      httpGet(URL, function (res) {
        const contentType = res.headers["content-type"];
        const isCss =
          URL.indexOf(".css") !== -1 ||
          (contentType && contentType.indexOf("text/css") !== -1);
        if (isCss) {
          console.log("down images form css file:" + URL + ".");
          downImgFromCss(URL);
        }
        saveResponse(res, filepath);
      });
    });
  });
}

main();
将 down.js 和 downHtml.js 放在同一个文件夹下,通过下列 cmd 命令运行:
D:\phantomjs-2.0.0-windows\bin\phantomjs.exe down.js http://www.youku.com/
以上所述就是本文的全部内容了,希望大家能够喜欢。