利用puppeteer采集自己想要的书籍

安装node


# Build and install Node.js from source under /usr/local/node/0.10.24.
# NOTE: the Puppeteer scripts later in this article use async/await, which
# needs Node.js 7.6 or newer; substitute a newer release for v0.10.24 before
# running them.
cd /usr/local/src/
wget http://nodejs.org/dist/v0.10.24/node-v0.10.24.tar.gz
tar zxvf node-v0.10.24.tar.gz
cd node-v0.10.24
./configure --prefix=/usr/local/node/0.10.24
make
make install
node -v

安装完node之后npm就安装好了

vagrant@homestead:~/code/Ecc3.0_System01$ npm -v

切换cnpm国内源


╰$ npm install -g cnpm --registry=https://registry.npm.taobao.org

安装puppeteer


╰$ cnpm i puppeteer

测试脚本


const puppeteer = require('puppeteer');

/**
 * Smoke test for the Puppeteer installation.
 *
 * Opens the Baidu home page, saves a screenshot and a PDF of it, searches for
 * "Python", then saves a screenshot and a PDF of the result page.
 * All files are written to ./output/, which must already exist.
 */
(async () => {
    // Plain timer used instead of page.waitFor(), which is deprecated/removed
    // in newer Puppeteer releases; this works on every version.
    const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

    // Declared outside the try so the finally block can always close it.
    let browser;
    try {
        browser = await puppeteer.launch();
        const page = await browser.newPage();
        await page.goto('https://www.baidu.com/');

        // Give the page time to settle before capturing it.
        await sleep(1000 * 2);

        // Screenshot of the home page.
        await page.screenshot({ path: './output/baidu.png', fullPage: true });

        // PDF of the home page.
        await page.pdf({ path: './output/baidu.pdf', format: "A4", printBackground: true });

        // Type the query; `delay` is the pause between key presses in
        // milliseconds and must be a number (the original passed `true`).
        await page.type('#kw', 'Python', { delay: 100 });
        // Submit the search.
        await page.click('#su');

        // Wait for the results to render.
        await sleep(1000 * 5);

        await page.setViewport({
            width: 1920,
            height: 1080
        });

        // Screenshot of the result page.
        await page.screenshot({ path: './output/baidu_python.png', fullPage: true });

        // PDF of the result page.
        await page.pdf({ path: './output/baidu_python.pdf', format: "a4", printBackground: true });
    } catch (error) {
        console.log(`this is the ${error}`);
    } finally {
        // Close the browser on failure too, so no Chromium process is left behind.
        if (browser) {
            await browser.close();
        }
    }
})();

结果


批量下载 ES6 文档


const puppeteer = require("puppeteer");

/**
 * Batch-exports the chapters of the ES6 tutorial to PDF.
 *
 * Loads the tutorial's README page, collects every link matched by
 * `ol li a`, then opens each link in its own tab and prints it to
 * <script dir>/output/<link text>.pdf. The output directory must already exist.
 */
(async () => {
    const path = require('path');

    // Declared outside the try so the finally block can always close it.
    let browser;
    try {
        browser = await puppeteer.launch({
            headless: true,
            // Switches must start with two ASCII hyphens; the original used an
            // en dash, which Chromium does not parse as a switch.
            args: [
                '--disable-gpu', // GPU hardware acceleration
                '--disable-dev-shm-usage', // shared memory used for temporary files
                '--disable-setuid-sandbox', // setuid sandbox
                '--no-first-run', // skip first-run setup (no start page configured)
                '--no-sandbox', // sandbox mode
                '--no-zygote',
                '--single-process' // run in a single process
            ]
        });
        const page = await browser.newPage();

        await page.goto('http://es6.ruanyifeng.com/#README', {
            'timeout': 0 // no navigation timeout
        });

        // Runs inside the page: collect the target URL and label of each link.
        const aTags = await page.evaluate(() => {
            const anchors = [...document.querySelectorAll('ol li a')];
            return anchors.map((a) => ({
                href: a.href.trim(),
                name: a.text
            }));
        });

        // Same ./output/ directory (relative to the script) that the merge
        // script reads from, instead of a machine-specific absolute path.
        const outputDir = path.join(__dirname, 'output');

        // Index 0 is skipped, exactly as in the original loop (presumably it is
        // the entry for the page already opened above -- confirm before changing).
        for (let i = 1; i < aTags.length; i++) {
            const a = aTags[i];
            const pageS = await browser.newPage();
            try {
                await pageS.goto(a.href, { 'timeout': 0 });
                // Link text becomes the file name, so strip surrounding
                // whitespace and path separators.
                const safeName = a.name.trim().replace(/[\\/]/g, '_');
                await pageS.pdf({ path: path.join(outputDir, `${safeName}.pdf`), format: 'a4' });
            } finally {
                // Always release the tab, even when this chapter failed.
                await pageS.close();
            }
            console.log("完成个数:" + i);
        }
        console.log("完成");
    } catch (err) {
        console.log(`this is the ${err}`);
    } finally {
        if (browser) {
            await browser.close();
        }
    }
})();

执行


╰$ node crawl.js

结果


将PDF合并到一块

╰$ sudo apt-get install pdftk
╰$ cnpm i pdf-merge

脚本

const PDFMerge = require('pdf-merge');const path = require('path');
const fs = require('fs');//
const { formatTime } = require('./modules/utils');
/**
 * @desc Builds a POSIX-style path anchored at this script's directory.
 * @param {String} dir first path segment, relative to __dirname
 * @param {String} [dir2=''] optional second segment appended after dir
 * @return {String} the joined path
 */
function resolve(dir, dir2 = '') {
    const segments = [__dirname, './', dir, dir2];
    return path.posix.join(...segments);
}
// Input and output directories, relative to this script.
const config = {
    entry: './output/',
    output: './data/'
};

// Matches the leading chapter number of names such as "12.xxx.pdf".
const CHAPTER_PREFIX = /^(\d{1,2})\./;

/**
 * @desc Extracts the leading chapter number from a file name.
 * @param {String} filename file name to inspect
 * @return {Number} the chapter number, or Number.MAX_SAFE_INTEGER when the
 *   name has no numeric prefix so that such files sort last
 */
function chapterIndex(filename) {
    const match = filename.match(CHAPTER_PREFIX);
    // Use the captured group: unary plus on the whole match array yields NaN.
    return match ? Number(match[1]) : Number.MAX_SAFE_INTEGER;
}

// Only PDFs can be merged; the entry directory may hold other files too.
const filenameArr = fs.readdirSync(resolve(config.entry));
const sortedFilenameArr = filenameArr
    .filter((name) => name.toLowerCase().endsWith('.pdf'))
    .sort((str1, str2) => chapterIndex(str1) - chapterIndex(str2));

const files = sortedFilenameArr.map((el) => resolve(`${config.entry}${el}`));
console.log('files', files);

const outputPath = resolve(config.output);
const isExists = fs.existsSync(outputPath);
console.log('isExists', isExists, 'outputPath', outputPath);

/**
 * @desc Creates the output directory and logs the outcome.
 */
function mkdirOutputpath() {
    try {
        fs.mkdirSync(outputPath);
        console.log('mkdir is successful!');
    } catch (e) {
        console.log('mkdir is failed!', e);
    }
}

// Create the output directory when it does not exist yet.
if (!isExists) {
    mkdirOutputpath();
}

console.log('let\'s start merge...');
const filename = `ES6 入门教程-${Date.now()}.pdf`;
const output = resolve(`${config.output}${filename}`);

// Merge all chapters and save the result as a new file.
PDFMerge(files, {
    output: output,
}).then((buffer) => {
    console.log('merge is successful!');
}).catch((err) => {
    // Report the failure instead of leaving the rejection unhandled.
    console.log('merge is failed!', err);
});

结果

(0)

相关推荐