xvi-phantom-scraper (deprecated)

A scraper based on phantom package, including cheerio for easy parsing.

Usage: no npm install needed!

<script type="module">
  import xviPhantomScraper from 'https://cdn.skypack.dev/xvi-phantom-scraper';
</script>

README

xvi-phantom-scraper

A scraper based on phantom package, including cheerio for easy parsing.

Example

//Add package
const Scraper = require('xvi-phantom-scraper');

// Create an instance of the scraper.
const scraper = new Scraper({
    // Array of sources; each of them is crawled sequentially when scraper.once() is called.
    sources: [{
        name: 'wikipedia-fruits', // name of this source
        url: 'https://simple.wikipedia.org/wiki/List_of_fruits', // URL of the source
        // Duration in ms to wait before retrieving the content of the page.
        // This can be useful to wait for JS execution; set to false in this example.
        waitBeforeHandler: false,
        /**
         * Handler applied to the content of the page.
         *
         * @param content - content of the page
         * @param $ - cheerio-style selector function for parsing the page
         * @param spider - holds the phantom objects (spider.instance, spider.page),
         *   the source information (spider.opts), and two methods:
         *   spider.exit() closes the page and destroys the spider (important), and
         *   spider.screenshot(opts) takes a screenshot of the current page.
         */
        handler: async function(content, $, spider) {
            // Uncomment to inspect the handler arguments:
            //console.log(content);
            //console.log($);
            //console.log(spider);

            try {
                $('.mw-body-content td.navbox-list a').each(function() {
                    const text = $(this).text();
                    const link = $(this).attr('href');
                    console.log(`- Found ${text} <${link}>`);
                });
            } finally {
                // Always release the spider, even if parsing throws; otherwise the
                // page and spider are never closed.
                // NOTE(review): presumably exit() is asynchronous; awaiting is
                // harmless if it is not — confirm against the package.
                await spider.exit();
            }
        }
    }]
});

/**
 * Test entry point: runs the scraper once over its configured sources.
 * Any error raised while crawling is caught and printed to the console.
 */
async function test() {
    try {
        // Crawl every source, one after another.
        await scraper.once();
    } catch (error) {
        console.log(error);
    }
}

test();

Will output:

/xvi-phantom-scraper$ node test.js
- Found Achene </w/index.php?title=Achene&action=edit&redlink=1>
- Found Berry </wiki/Berry>
- Found Capsule </w/index.php?title=Capsule_(fruit)&action=edit&redlink=1>
- Found Caryopsis </w/index.php?title=Caryopsis&action=edit&redlink=1>
- Found Drupe </wiki/Drupe>
...