crwlr

A minimal Puppeteer crawler API

Usage (no npm install needed!)

<script type="module">
  import crwlr from 'https://cdn.skypack.dev/crwlr';
</script>

README

crwlr NPM version Build Status Dependency Status Coverage percentage

A minimal Puppeteer crawler API

Huh?

  • crwlr:
    • handles the boring boilerplate work of actually crawling a site
  • You provide (the call is crwlr(browser, url, pageOptions)):
    • <String> url to start from
    • <Puppeteer Browser> browser instance, created with your own .launch(options)
    • pageOptions (optional), with any of the following:
      • <Object> goto: passed as the options argument to page.goto(url, options)
      • <Function> prepare(page): binds event handlers and/or sets properties on every new page
      • <Function> resolved(response, page): fires after every page.goto() has resolved

Installation

$ npm install --save crwlr

Usage

Basic Example - Without Any Options

'use strict';

const puppeteer = require('puppeteer');
const crwlr = require('crwlr');

// URL the crawl starts from.
const site = 'https://buster.neocities.org/crwlr/';

// *** Basic Example Without Any Options *** //
(async () => {
  const browser = await puppeteer.launch();
  try {
    // With no pageOptions, crwlr crawls from `site` and resolves to an array
    // of the crawled page URLs (see the sample output below).
    const crawledPages = await crwlr(browser, site);
    console.log(crawledPages);
  } finally {
    // Always shut down the browser we launched, even if the crawl throws;
    // a browser process left running can keep Node from exiting.
    await browser.close();
  }
})();
/*
[ 'https://buster.neocities.org/crwlr/',
  'https://buster.neocities.org/crwlr/other.html',
  'https://buster.neocities.org/crwlr/mixed-content.html',
  'https://buster.neocities.org/crwlr/missing.html',
  'https://buster.neocities.org/crwlr/dummy.pdf' ]
*/

Advanced Example - With Options

'use strict';

const puppeteer = require('puppeteer');
const crwlr = require('crwlr');

// URL the crawl starts from.
const site = 'https://buster.neocities.org/crwlr/';

// *** Advanced Example With Options *** //
(async () => {
  // Launch with a visible window so the crawl can be watched.
  const browser = await puppeteer.launch({
    headless: false
  });

  const pageOptions = {
    // Called with every new page: log each JavaScript file the page requests.
    prepare: page => {
      page.on('request', request => {
        if (request.url().endsWith('.js')) {
          console.log(`${page.url()} => requested: ${request.url()}`);
        }
      });
    },
    // Passed through as the options argument of page.goto(url, options).
    goto: {
      waitUntil: 'networkidle2'
    },
    // Fires after every page.goto() has resolved: log the HTTP status per page.
    resolved: (response, page) => {
      console.log(`=> resolved: ${response.status()} ${page.url()}`);
    }
  };

  try {
    await crwlr(browser, site, pageOptions);
  } finally {
    // Always shut down the browser we launched, even if the crawl throws;
    // otherwise the browser window and its process are left running.
    await browser.close();
  }
})();
/*
=> resolved: 200 https://buster.neocities.org/crwlr/
=> resolved: 200 https://buster.neocities.org/crwlr/other.html
https://buster.neocities.org/crwlr/mixed-content.html => requested: https://mixed-script.badssl.com/nonsecure.js
=> resolved: 200 https://buster.neocities.org/crwlr/mixed-content.html
=> resolved: 404 https://buster.neocities.org/crwlr/missing.html
=> resolved: 200 https://buster.neocities.org/crwlr/dummy.pdf
*/

License

ISC © Buster Collings