利用puppeteer采集自己想要的书籍
安装node
# Build and install Node.js from source under /usr/local/node/0.10.24.
# NOTE(review): the scripts later in this article use async/await, which
# requires Node 7.6 or newer; v0.10.24 predates that. Confirm the version
# (and adjust the URL, directory and prefix together) before running.
cd /usr/local/src/
wget http://nodejs.org/dist/v0.10.24/node-v0.10.24.tar.gz
tar zxvf node-v0.10.24.tar.gz
cd node-v0.10.24
./configure --prefix=/usr/local/node/0.10.24
make
make install
# Verify the installation by printing the installed version.
node -v
安装完node之后npm就安装好了
vagrant@homestead:~/code/Ecc3.0_System01$ npm -v
切换cnpm国内源
╰$ npm install -g cnpm --registry=https://registry.npm.taobao.org
安装puppeteer
╰$ cnpm i puppeteer
测试脚本
const fs = require('fs');
const puppeteer = require('puppeteer');

/**
 * Pause for a fixed amount of time.
 *
 * Used instead of `page.waitFor(ms)`, which is deprecated/removed in later
 * Puppeteer releases; a plain timer works with every version.
 *
 * @param {number} ms - Milliseconds to wait.
 * @returns {Promise<void>} Resolves once the delay has elapsed.
 */
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/**
 * Smoke test: open Baidu, capture the home page as PNG and PDF, search for
 * "Python", then capture the result page as PNG and PDF.
 * All artefacts are written to ./output/ (relative to the working directory).
 */
(async () => {
  let browser;
  try {
    // screenshot()/pdf() do not create missing directories themselves.
    fs.mkdirSync('./output', { recursive: true });

    browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto('https://www.baidu.com/');

    // Give the page time to finish rendering.
    await sleep(1000 * 2);

    // Screenshot of the home page.
    await page.screenshot({ path: './output/baidu.png', fullPage: true });
    // PDF of the home page.
    await page.pdf({ path: './output/baidu.pdf', format: 'A4', printBackground: true });

    // Type the query; `delay` is the pause between key presses in milliseconds
    // (the original passed `true`, which is not a valid delay value).
    await page.type('#kw', 'Python', { delay: 100 });
    // Submit the search.
    await page.click('#su');

    // Wait for the result list to load.
    await sleep(1000 * 5);

    await page.setViewport({
      width: 1920,
      height: 1080,
    });

    // Screenshot of the search results.
    await page.screenshot({ path: './output/baidu_python.png', fullPage: true });
    // PDF of the search results.
    await page.pdf({ path: './output/baidu_python.pdf', format: 'a4', printBackground: true });
  } catch (error) {
    console.log(`this is the ${error}`);
  } finally {
    // Always shut Chromium down, even when a step above threw.
    if (browser) {
      await browser.close();
    }
  }
})();
结果
批量下载 ES6 文档
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer');

// Chapter PDFs are written next to this script, in the same ./output/
// directory the merge script reads from (the original hard-coded an absolute
// path that only existed on the author's machine).
const OUTPUT_DIR = path.join(__dirname, 'output');

/**
 * Crawl the table of contents of http://es6.ruanyifeng.com and save each
 * linked chapter as `<link text>.pdf` inside OUTPUT_DIR.
 */
(async () => {
  let browser;
  try {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });

    browser = await puppeteer.launch({
      headless: true,
      // Chromium switches must start with two ASCII hyphens. The original used
      // a typographic en-dash, so none of these were recognised as flags.
      args: [
        '--disable-gpu', // GPU hardware acceleration
        '--disable-dev-shm-usage', // shared memory for temporary files
        '--disable-setuid-sandbox', // setuid sandbox
        '--no-first-run', // no start page configured; opens a blank page on launch
        '--no-sandbox', // sandbox mode
        '--no-zygote',
        '--single-process', // run in a single process
      ],
    });

    const page = await browser.newPage();
    await page.goto('http://es6.ruanyifeng.com/#README', {
      timeout: 0, // no navigation timeout
    });

    // Collect the target URL and link text of every entry in the ordered list.
    const aTags = await page.evaluate(() => {
      const anchors = [...document.querySelectorAll('ol li a')];
      return anchors.map((a) => ({
        href: a.href.trim(),
        name: a.text,
      }));
    });

    // The loop starts at index 1, exactly as the original did (its export of
    // aTags[0] was commented out).
    // NOTE(review): confirm the first link really should be skipped.
    for (let i = 1; i < aTags.length; i++) {
      const { href, name } = aTags[i];
      // Path separators in the link text would point outside OUTPUT_DIR.
      const safeName = name.replace(/[\\/]/g, '_');
      const chapterPage = await browser.newPage();
      try {
        await chapterPage.goto(href, { timeout: 0 });
        await chapterPage.pdf({
          path: path.join(OUTPUT_DIR, `${safeName}.pdf`),
          format: 'a4',
        });
        console.log(`完成个数:${i}`);
      } finally {
        // Release the tab before opening the next one.
        await chapterPage.close();
      }
    }

    console.log("完成");
  } catch (err) {
    console.log(`this is the ${err}`);
  } finally {
    // Always shut Chromium down, even when a step above threw.
    if (browser) {
      await browser.close();
    }
  }
})();
执行
╰$ node crawl.js
结果
将 PDF 合并到一块
╰$ sudo apt-get install pdftk
╰$ cnpm i pdf-merge
脚本
const PDFMerge = require('pdf-merge');
const path = require('path');
const fs = require('fs');

/**
 * Build a path relative to the directory containing this script.
 *
 * @param {string} dir - First path segment.
 * @param {string} [dir2=''] - Optional second path segment.
 * @returns {string} The joined path.
 */
function resolve(dir, dir2 = '') {
  return path.posix.join(__dirname, './', dir, dir2);
}

// Input directory holding the chapter PDFs and output directory for the
// merged book, both relative to this script.
const config = {
  entry: './output/',
  output: './data/',
};

/**
 * Extract the leading chapter number from a file name such as "12. Foo.pdf".
 *
 * The original comparator applied unary `+` to the whole match array, which
 * yields NaN for every numbered file and therefore an undefined sort order.
 *
 * @param {string} filename - File name to inspect.
 * @returns {number} The chapter number, or 0 when the name has no numeric prefix.
 */
function chapterNumber(filename) {
  const match = filename.match(/^(\d{1,2})\./);
  return match ? Number(match[1]) : 0;
}

// Only PDFs can be merged; the entry directory may also hold other artefacts
// (e.g. screenshots) that pdftk would reject.
const filenameArr = fs
  .readdirSync(resolve(config.entry))
  .filter((name) => path.extname(name).toLowerCase() === '.pdf');

// Sort a copy by chapter number so the merged book keeps the original order.
const sortedFilenameArr = [...filenameArr].sort(
  (str1, str2) => chapterNumber(str1) - chapterNumber(str2)
);

const files = sortedFilenameArr.map((el) => resolve(`${config.entry}${el}`));
console.log('files', files);

const outputPath = resolve(config.output);
const isExists = fs.existsSync(outputPath);
console.log('isExists', isExists, 'outputPath', outputPath);

/**
 * Create the output directory, logging (not throwing) on failure.
 */
function mkdirOutputpath() {
  try {
    fs.mkdirSync(outputPath);
    console.log('mkdir is successful!');
  } catch (e) {
    console.log('mkdir is failed!', e);
  }
}

// Create the output directory if it does not exist yet.
if (!isExists) {
  mkdirOutputpath();
}

if (files.length === 0) {
  // Nothing to hand to pdftk; bail out with a clear message instead.
  console.log('no PDF files found in', resolve(config.entry));
} else {
  console.log('let\'s start merge...');

  // Timestamped name so repeated runs never overwrite an earlier result.
  const filename = `ES6 入门教程-${Date.now()}.pdf`;
  const output = resolve(`${config.output}${filename}`);

  // Save as new file
  PDFMerge(files, {
    output: output,
  })
    .then(() => {
      console.log('merge is successful!');
    })
    .catch((err) => {
      // Surface pdftk / filesystem failures instead of an unhandled rejection.
      console.log('merge is failed!', err);
    });
}
结果
赞 (0)