defmodule Crawly.Fetchers.Splash do
  @moduledoc """
  Implements the Crawly.Fetchers.Fetcher behaviour for Splash JavaScript rendering.

  Splash is a lightweight, QT-based JavaScript rendering engine. See:
  https://splash.readthedocs.io/

  Splash exposes the `render.html` endpoint, which renders the page passed in
  via the `?url` GET parameter.

  This fetcher converts every request made by Crawly into a Splash request and
  cleans up the final response by replacing the Splash endpoint URL with the
  originally requested URL.

  The Splash server can be started in any of the documented ways. One option is
  to run it locally with the help of Docker:

      docker run -it -p 8050:8050 scrapinghub/splash

  In that case, configure the fetcher as follows:

      fetcher: {Crawly.Fetchers.Splash, [base_url: "http://localhost:8050/render.html"]},
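
  Options other than `:base_url` are forwarded to Splash as query parameters,
  so arguments accepted by the `render.html` endpoint (such as `wait`) should be
  passable through the same keyword list, for example:

      fetcher: {Crawly.Fetchers.Splash, [base_url: "http://localhost:8050/render.html", wait: 3]},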
"""
@behaviour Crawly.Fetchers.Fetcher
require Logger
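
  # Fetches `request.url` through the Splash render.html endpoint configured
  # under :base_url, then rewrites the HTTPoison request/response structs so
  # they reference the original page URL instead of the Splash endpoint.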
  @spec fetch(request, client_options) :: {:ok, response} | {:error, term()}
        when request: Crawly.Request.t(),
             client_options: Keyword.t(),
             response: Crawly.Response.t()
  def fetch(request, client_options) do
    {base_url, other_options} =
      case Keyword.pop(client_options, :base_url, nil) do
        {nil, _} ->
          throw(
            "The base_url is not set. Splash fetcher can't be used! " <>
              "Please set :base_url in fetcher options to continue. " <>
              "For example: " <>
              "fetcher: {Crawly.Fetchers.Splash, [base_url: <url>]}"
          )

        {base_url, other_options} ->
          {base_url, other_options}
      end
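
    # Everything left in the fetcher options after removing :base_url is sent
    # to Splash as query parameters, alongside the URL of the page to render.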
    query_parameters =
      URI.encode_query(Keyword.put(other_options, :url, request.url))

    url =
      base_url
      |> URI.merge("?" <> query_parameters)
      |> URI.to_string()
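
    # Execute the request against the Splash endpoint, then rewrite the
    # returned request/response so downstream middlewares and pipelines see
    # the original page URL rather than the Splash URL.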
    case HTTPoison.get(url, request.headers, request.options) do
      {:ok, response} ->
        new_request = %HTTPoison.Request{response.request | url: request.url}

        new_response = %HTTPoison.Response{
          response
          | request: new_request,
            request_url: request.url
        }

        {:ok, new_response}

      error ->
        error
    end
  end
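
  # A minimal usage sketch (hypothetical values; assumes a local Splash
  # instance started as described in the moduledoc):
  #
  #   request = %Crawly.Request{url: "https://example.com", headers: [], options: []}
  #
  #   {:ok, response} =
  #     Crawly.Fetchers.Splash.fetch(request,
  #       base_url: "http://localhost:8050/render.html"
  #     )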
end