defmodule Crawly.Engine do
@moduledoc """
Crawly Engine - a GenServer responsible for starting and stopping spiders.
It keeps track of all currently running spiders and of the list of known
spider modules.
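
A minimal usage sketch (`MySpider` stands for any module implementing the
`Crawly.Spider` behaviour):

    Crawly.Engine.start_spider(MySpider)
    Crawly.Engine.running_spiders()
    Crawly.Engine.stop_spider(MySpider, :manual_stop)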
"""
require Logger
use GenServer
@type t :: %__MODULE__{
started_spiders: started_spiders(),
known_spiders: [Crawly.spider()]
}
@type started_spiders() :: %{
        optional(Crawly.spider()) => {identifier(), binary()}
      }
@type spider_info() :: %{
name: Crawly.spider(),
status: :stopped | :started,
pid: identifier() | nil
}
defstruct(started_spiders: %{}, known_spiders: [])
@doc """
Starts a spider. All options passed in the second argument will be passed along to the spider's `init/1` callback.
### Reserved Options
- `:crawl_id` (binary). Optional, automatically generated if not set.
- `:closespider_itemcount` (integer() | :disabled). Optional, overrides the
  close spider item count on startup.
- `:closespider_timeout` (integer() | :disabled). Optional, overrides the
  close spider timeout on startup.
- `:concurrent_requests_per_domain` (integer()). Optional, overrides the
  number of workers for a given spider.
### Backward compatibility
If the second positional argument is a binary, it is treated as the
`:crawl_id`. This form is deprecated and will be removed in a future release.
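
### Examples

`MySpider` in the snippets below stands for any module implementing the
`Crawly.Spider` behaviour:

    iex> Crawly.Engine.start_spider(MySpider, crawl_id: "test-crawl-1")
    :ok

    iex> Crawly.Engine.start_spider(MySpider)
    {:error, :spider_already_started}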
"""
@type crawl_id_opt :: {:crawl_id, binary()} | GenServer.option()
@spec start_spider(Crawly.spider(), opts) :: result
when opts: [crawl_id_opt],
result:
:ok
| {:error, :spider_already_started}
| {:error, atom()}
def start_spider(spider_name, opts \\ [])
def start_spider(spider_name, crawl_id) when is_binary(crawl_id) do
Logger.warning(
  "Deprecation warning: passing the crawl_id as the second positional " <>
    "argument is deprecated. Use the :crawl_id option instead. See the docs " <>
    "for details: https://hexdocs.pm/crawly/Crawly.Engine.html#start_spider/2"
)
start_spider(spider_name, crawl_id: crawl_id)
end
def start_spider(spider_name, opts) when is_list(opts) do
opts =
Enum.into(opts, %{})
|> Map.put_new_lazy(:crawl_id, &UUID.uuid1/0)
# Send this spider's log output to a dedicated per-crawl file, if configured
case {Crawly.Utils.get_settings(:log_to_file, spider_name),
Crawly.Utils.ensure_loaded?(LoggerFileBackend)} do
{true, true} ->
configure_spider_logs(spider_name, opts[:crawl_id])
{true, false} ->
Logger.warning(
  ":logger_file_backend (https://github.com/onkel-dirtus/logger_file_backend#loggerfilebackend) " <>
    "must be installed as a peer dependency when the :log_to_file setting is true"
)
_ ->
false
end
GenServer.call(
__MODULE__,
{:start_spider, spider_name, opts[:crawl_id], Map.to_list(opts)}
)
end
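@doc """
Returns the pid of the `Crawly.Manager` process that supervises the given
spider's workers, or `{:error, :spider_not_found}` if the spider is not
running or no manager child could be found.
"""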
@spec get_manager(Crawly.spider()) :: pid() | {:error, :spider_not_found}
def get_manager(spider_name) do
case Map.fetch(running_spiders(), spider_name) do
:error ->
{:error, :spider_not_found}
{:ok, {pid_sup, _job_tag}} ->
Supervisor.which_children(pid_sup)
# match?/2 returns a boolean instead of raising when a child does not match
|> Enum.find(&match?({Crawly.Manager, _, :worker, [Crawly.Manager]}, &1))
|> case do
nil ->
{:error, :spider_not_found}
{_, pid, :worker, _} ->
pid
end
end
end
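@doc """
Stops a running spider. The optional `reason` (`:ignore` by default) is
passed to the `:on_spider_closed_callback` setting, if one is configured,
and recorded in the job log. Returns `{:error, :spider_not_running}` if the
spider was not started.

### Examples

`MySpider` below is illustrative:

    iex> Crawly.Engine.stop_spider(MySpider, :manual_stop)
    :ok
"""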
@spec stop_spider(Crawly.spider(), reason) :: result
when reason: :itemcount_limit | :itemcount_timeout | atom(),
result:
:ok | {:error, :spider_not_running} | {:error, :spider_not_found}
def stop_spider(spider_name, reason \\ :ignore) do
GenServer.call(__MODULE__, {:stop_spider, spider_name, reason})
end
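@doc """
Lists all spiders known to the engine along with their current status.
Returns a list of `spider_info()` maps, e.g. (illustrative):

    [%{name: MySpider, status: :stopped, pid: nil}]
"""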
@spec list_known_spiders() :: [spider_info()]
def list_known_spiders() do
GenServer.call(__MODULE__, :list_known_spiders)
end
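@doc """
Returns the map of currently running spiders, keyed by spider module.
Each value is a `{pid, crawl_id}` tuple.
"""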
@spec running_spiders() :: started_spiders()
def running_spiders() do
GenServer.call(__MODULE__, :running_spiders)
end
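@doc """
Returns a `spider_info()` map for the given spider, or `nil` if the spider
is not known to the engine.
"""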
@spec get_spider_info(Crawly.spider()) :: spider_info() | nil
def get_spider_info(spider_name) do
GenServer.call(__MODULE__, {:get_spider, spider_name})
end
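@doc """
Re-scans the code path for spider modules and refreshes the engine's list
of known spiders. Asynchronous; always returns `:ok`.
"""
@spec refresh_spider_list() :: :ok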
def refresh_spider_list() do
GenServer.cast(__MODULE__, :refresh_spider_list)
end
def start_link(_) do
GenServer.start_link(__MODULE__, [], name: __MODULE__)
end
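@doc """
Returns `{:ok, crawl_id}` for a running spider, or
`{:error, :spider_not_running}` otherwise.
"""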
@spec get_crawl_id(Crawly.spider()) ::
{:error, :spider_not_running} | {:ok, binary()}
def get_crawl_id(spider_name) do
GenServer.call(__MODULE__, {:get_crawl_id, spider_name})
end
@spec init(any) :: {:ok, t()}
def init(_args) do
spiders = get_updated_known_spider_list()
{:ok, %Crawly.Engine{known_spiders: spiders}}
end
def handle_call({:get_manager, spider_name}, _from, state) do
  reply =
    case Map.get(state.started_spiders, spider_name) do
      nil ->
        {:error, :spider_not_found}

      # started_spiders stores {pid, crawl_id} tuples
      {pid, _crawl_id} ->
        pid
    end

  {:reply, reply, state}
end
def handle_call({:get_crawl_id, spider_name}, _from, state) do
msg =
case Map.get(state.started_spiders, spider_name) do
nil ->
{:error, :spider_not_running}
{_pid, crawl_id} ->
{:ok, crawl_id}
end
{:reply, msg, state}
end
def handle_call(:running_spiders, _from, state) do
{:reply, state.started_spiders, state}
end
def handle_call(:list_known_spiders, _from, state) do
return = Enum.map(state.known_spiders, &format_spider_info(&1, state))
{:reply, return, state}
end
def handle_call(
{:start_spider, spider_name, crawl_id, options},
_from,
state
) do
result =
case Map.get(state.started_spiders, spider_name) do
nil ->
Crawly.EngineSup.start_spider(spider_name, options)
_ ->
{:error, :spider_already_started}
end
{msg, new_started_spiders} =
case result do
{:ok, pid} ->
# Insert information about the job into storage
Crawly.Models.Job.new(crawl_id, spider_name)
{:ok, Map.put(state.started_spiders, spider_name, {pid, crawl_id})}
{:error, _} = err ->
{err, state.started_spiders}
end
{:reply, msg, %Crawly.Engine{state | started_spiders: new_started_spiders}}
end
def handle_call({:stop_spider, spider_name, reason}, _from, state) do
{msg, new_started_spiders} =
case Map.pop(state.started_spiders, spider_name) do
{nil, _} ->
{{:error, :spider_not_running}, state.started_spiders}
{{pid, crawl_id}, new_started_spiders} ->
case Crawly.Utils.get_settings(
:on_spider_closed_callback,
spider_name
) do
nil -> :ignore
fun -> apply(fun, [spider_name, crawl_id, reason])
end
# Update jobs log information
{:stored_items, num} = Crawly.DataStorage.stats(spider_name)
Crawly.Models.Job.update(crawl_id, num, reason)
Crawly.EngineSup.stop_spider(pid)
{:ok, new_started_spiders}
end
{:reply, msg, %Crawly.Engine{state | started_spiders: new_started_spiders}}
end
def handle_call({:get_spider, spider_name}, _from, state) do
return =
if Enum.member?(state.known_spiders, spider_name) do
format_spider_info(spider_name, state)
end
{:reply, return, state}
end
def handle_cast(:refresh_spider_list, state) do
updated = get_updated_known_spider_list(state.known_spiders)
{:noreply, %Crawly.Engine{state | known_spiders: updated}}
end
# Builds the spider_info map for a single known spider
defp format_spider_info(spider_name, state) do
  # started_spiders stores {pid, crawl_id} tuples
  pid =
    case Map.get(state.started_spiders, spider_name) do
      {pid, _crawl_id} -> pid
      nil -> nil
    end

  %{
    name: spider_name,
    status: if(is_nil(pid), do: :stopped, else: :started),
    pid: pid
  }
end
defp get_updated_known_spider_list(known \\ []) do
  new = Crawly.Utils.list_spiders()

  # Enum.uniq/1 drops duplicates anywhere in the list, whereas
  # Enum.dedup_by/2 would only collapse consecutive duplicates
  Enum.uniq(known ++ new)
end
defp configure_spider_logs(spider_name, crawl_id) do
  log_file_path = Crawly.Utils.spider_log_path(spider_name, crawl_id)

  Logger.add_backend({LoggerFileBackend, :debug})

  Logger.configure_backend({LoggerFileBackend, :debug},
    path: log_file_path,
    level: :debug,
    # Only write entries that carry this crawl_id in their Logger metadata
    metadata_filter: [crawl_id: crawl_id]
  )

  Logger.debug("Writing logs to #{log_file_path}")
end
end