Reference Script
Reference Script
This script serves as a reference that shows all features of Flyscrape and how to use them. Feel free to copy and paste it as a starter script.
Reference
import { parse } from "flyscrape";
import { download } from "flyscrape/http";
import http from "flyscrape/http";
// Scraper configuration. Every option below is shown with an example value;
// the stated defaults apply when an option is omitted.
export const config = {
  // Specify the URL to start scraping from.
  url: "https://example.com/",

  // Specify multiple URLs to start scraping from. (default = [])
  urls: [
    "https://anothersite.com/",
    "https://yetanother.com/",
  ],

  // Specify how deep links should be followed. (default = 0, no follow)
  depth: 5,

  // Specify the CSS selectors to follow. (default = ["a[href]"])
  follow: [".next > a", ".related a"],

  // Specify the allowed domains. ['*'] for all. (default = domain from url)
  allowedDomains: ["example.com", "anothersite.com"],

  // Specify the blocked domains. (default = none)
  blockedDomains: ["somesite.com"],

  // Specify the allowed URLs as regex. (default = all allowed)
  // Backslashes must be doubled inside a JavaScript string literal; a single
  // "\d" would collapse to a plain "d" before the pattern is ever used.
  allowedURLs: ["/posts", "/articles/\\d+"],

  // Specify the blocked URLs as regex. (default = none)
  blockedURLs: ["/admin"],

  // Specify the rate in requests per minute. (default = no rate limit)
  rate: 60,

  // Specify the number of concurrent requests. (default = no limit)
  concurrency: 1,

  // Specify a single HTTP(S) proxy URL. (default = no proxy)
  proxy: "http://someproxy.com:8043",

  // Specify multiple HTTP(S) proxy URLs. (default = no proxy)
  proxies: [
    "http://someproxy.com:8043",
    "http://someotherproxy.com:8043",
  ],

  // Enable file-based request caching. (default = no cache)
  cache: "file",

  // Specify the HTTP request headers. (default = none)
  headers: {
    "Authorization": "Bearer ...",
    "User-Agent": "Mozilla ...",
  },

  // Specify the output options.
  output: {
    // Specify the output file. (default = stdout)
    file: "results.json",

    // Specify the output format. (default = json)
    // Options: "json" | "ndjson"
    format: "json",
  },
};
// Optional setup hook, invoked before scraping starts.
// Logs in by requesting the login form, reading its csrf token and
// submitting the credentials together with that token.
export function setup() {
  const loginURL = "http://example.com/login";

  // Request the page that carries the login form.
  const loginPage = http.get(loginURL);

  // Read the csrf token from the form's hidden input.
  const tokenInput = parse(loginPage.body).find("input[name=csrf]");
  const csrf = tokenInput.attr("value");

  // Send the credentials along with the token.
  http.postForm(loginURL, {
    username: "jondoe",
    password: "supersecret",
    csrf,
  });
}
/**
 * Scrape handler: downloads each user's profile picture and returns the
 * name, address and age of every user found in the document.
 *
 * @param {object} context
 * @param {object} context.doc - Contains the parsed HTML document.
 * @param {string} context.url - Contains the scraped URL.
 * @param {Function} context.absoluteURL - Transforms relative URLs into absolute URLs.
 * @returns {{ users: * }} Object whose `users` entry holds one
 *   `{ name, address, age }` record per matched ".user" element.
 */
export default function ({ doc, url, absoluteURL }) {
  // Every element carrying the "user" class is treated as one user entry.
  const users = doc.find(".user");

  // First pass: save each user's profile picture under the user's name.
  users.each((entry) => {
    const name = entry.find(".name").text();
    const src = entry.find("img").attr("src");
    download(absoluteURL(src), `profile-pictures/${name}.jpg`);
  });

  // Second pass: reduce each entry to the text of the fields being reported.
  const toRecord = (entry) => ({
    name: entry.find(".name").text(),
    address: entry.find(".address").text(),
    age: entry.find(".age").text(),
  });

  return { users: users.map(toRecord) };
}