Reference Script

Reference Script

This script serves as a reference that shows all features of Flyscrape and how to use them. Feel free to copy and paste it as a starter script.

Reference
import { parse } from "flyscrape";
import { download } from "flyscrape/http";
import http from "flyscrape/http";

// Scraper configuration. Every option is shown here for reference; keep only
// the ones you need.
export const config = {
    // Specify the URL to start scraping from.
    url: "https://example.com/",

    // Specify multiple URLs to start scraping from.      (default = [])
    urls: [
        "https://anothersite.com/",
        "https://yetanother.com/",
    ],

    // Specify how deep links should be followed.          (default = 0, no follow)
    depth: 5,

    // Specify the css selectors to follow.                (default = ["a[href]"])
    follow: [".next > a", ".related a"],

    // Specify the allowed domains. ['*'] for all.         (default = domain from url)
    allowedDomains: ["example.com", "anothersite.com"],

    // Specify the blocked domains.                        (default = none)
    blockedDomains: ["somesite.com"],

    // Specify the allowed URLs as regex.                  (default = all allowed)
    // Backslashes must be doubled inside a string literal: "\d" would
    // evaluate to a plain "d" and the pattern would no longer match digits.
    allowedURLs: ["/posts", "/articles/\\d+"],

    // Specify the blocked URLs as regex.                  (default = none)
    blockedURLs: ["/admin"],

    // Specify the rate in requests per minute.            (default = no rate limit)
    rate: 60,

    // Specify the number of concurrent requests.          (default = no limit)
    concurrency: 1,

    // Specify a single HTTP(S) proxy URL.                 (default = no proxy)
    proxy: "http://someproxy.com:8043",

    // Specify multiple HTTP(S) proxy URLs.                (default = no proxy)
    proxies: [
        "http://someproxy.com:8043",
        "http://someotherproxy.com:8043",
    ],

    // Enable file-based request caching.                  (default = no cache)
    cache: "file",

    // Specify the HTTP request header.                    (default = none)
    headers: {
        "Authorization": "Bearer ...",
        "User-Agent": "Mozilla ...",
    },

    // Specify the output options.
    output: {
        // Specify the output file.                        (default = stdout)
        file: "results.json",

        // Specify the output format.                      (default = json)
        // Options: "json" | "ndjson"
        format: "json",
    },
};

// Optional hook that runs once before scraping starts. Here it logs in:
// it loads the login form, reads the CSRF token from it and submits the
// credentials together with that token.
export function setup() {
    const loginURL = "http://example.com/login";

    // Load the login page and locate the hidden CSRF input in its markup.
    const loginPage = http.get(loginURL);
    const tokenField = parse(loginPage.body).find("input[name=csrf]");
    const csrf = tokenField.attr("value");

    // Post the credentials along with the token that was just read.
    const credentials = {
        username: "jondoe",
        password: "supersecret",
        csrf,
    };
    http.postForm(loginURL, credentials);
}

/**
 * Scrape function, called with the context of the page being scraped.
 * Downloads the profile picture of every `.user` element and returns the
 * text of each user's name, address and age.
 *
 * @param {object} context
 * @param {object} context.doc - Contains the parsed HTML document
 * @param {string} context.url - Contains the scraped URL
 * @param {function} context.absoluteURL - Transforms relative URLs into absolute URLs
 * @returns {object} `{ users }`, one `{ name, address, age }` entry per `.user` element
 */
export default function scrape({ doc, url, absoluteURL }) {
  // Find all users.
  const userlist = doc.find(".user");

  // Download the profile picture of each user.
  userlist.each((user) => {
    const name = user.find(".name").text();
    const pictureSrc = user.find("img").attr("src");

    // Without a name or an image source there is nothing sensible to
    // download or to name the file after, so skip this user.
    if (!name || !pictureSrc) {
      return;
    }

    // The name is scraped (untrusted) text: replace path separators so the
    // file cannot end up outside of the profile-pictures/ directory.
    const fileName = name.replace(/[\\/]/g, "_");

    download(absoluteURL(pictureSrc), `profile-pictures/${fileName}.jpg`);
  });

  // Return users name, address and age.
  return {
    users: userlist.map((user) => {
      const name = user.find(".name").text();
      const address = user.find(".address").text();
      const age = user.find(".age").text();

      return { name, address, age };
    }),
  };
}