lib/agents/scraper.ex

defmodule LangChain.Scraper do
  @moduledoc """
  A Scraper is a GenServer that scrapes natural-language text and tries to turn it into
  structured data. It comes with a built-in "default_scraper" that can generally extract data
  from text according to the schema you give it.

  ## Examples

      {:ok, scraper_pid} = Scraper.start_link()
      input_text = "John Doe is 30 years old."
      {:ok, result} = Scraper.scrape(scraper_pid, input_text)

      {:ok, result_xml} = Scraper.scrape(scraper_pid, input_text, "default_scraper", %{
        outputFormat: "XML"
      })

      {:ok, result_yml} = Scraper.scrape(scraper_pid, input_text, "default_scraper", %{
        inputSchema: "{ name: { first: String, last: String }, age: Number }",
        outputFormat: "YAML"
      })
  """
  use GenServer

  @timeout 120_000

  alias LangChain.{ScrapeChain, ChainLink, Chat, PromptTemplate, Chain}
  # Client API


  @doc """
  Starts a Scraper process linked to the caller.

  `opts` is handed unchanged to `GenServer.start_link/3` (for example `name: MyScraper`).
  """
  def start_link(opts \\ []), do: GenServer.start_link(__MODULE__, %{}, opts)

  # GenServer callback. The init argument is ignored; the initial state is a map of
  # scrape-chain name => scrape chain, seeded with the built-in "default_scraper".
  def init(_args) do
    {:ok, %{"default_scraper" => default_scrape_chain()}}
  end

  @doc """
  Lists every scrape chain registered in the Scraper.

  Returns a list of `{name, scrape_chain}` tuples.
  """
  def list(pid), do: GenServer.call(pid, :list)

  @doc """
  Registers a custom scrape chain under `name`.

  A chain already stored under the same `name` is replaced. Returns `:ok`.
  """
  def add_scrape_chain(pid, name, scrape_chain) do
    request = {:add_scrape_chain, name, scrape_chain}
    GenServer.call(pid, request)
  end

  @doc """
  Scrapes `input_text` with the scrape chain registered under `name`.

  `name` defaults to `"default_scraper"`. `opts` may override, for this call only, the
  chain's `:inputSchema` and `:outputParser`, and may set the `:outputFormat`
  (`"JSON"` when omitted).

  Returns `{:ok, result}`, or `{:error, "ScrapeChain not found"}` when no chain is
  registered under `name`. The call waits up to 120 seconds for the server's reply.
  """
  def scrape(pid, input_text, name \\ "default_scraper", opts \\ %{}) do
    request = {:scrape, name, input_text, opts}
    GenServer.call(pid, request, @timeout)
  end

  # Replies with every registered chain as a list of {name, scrape_chain} tuples;
  # the state is left untouched.
  def handle_call(:list, _from, state) do
    {:reply, Map.to_list(state), state}
  end

  # Stores (or replaces) the chain under `name` and acknowledges with :ok.
  def handle_call({:add_scrape_chain, name, scrape_chain}, _from, state) do
    {:reply, :ok, Map.put(state, name, scrape_chain)}
  end

  # Runs the chain registered under `name` against `input_text`.
  #
  # `opts` (a map or a keyword list) may carry per-call overrides:
  #   * :inputSchema  - replaces the chain's stored schema
  #   * :outputParser - replaces the chain's stored output parser
  #   * :outputFormat - output format passed to the prompt, "JSON" when absent
  #
  # Replies {:ok, result}, or {:error, "ScrapeChain not found"} for an unknown name.
  # The stored chain is never modified; overrides are applied to a temporary chain.
  def handle_call({:scrape, name, input_text, opts}, _from, state) do
    case Map.get(state, name) do
      nil ->
        {:reply, {:error, "ScrapeChain not found"}, state}

      scrape_chain ->
        # Options are conventionally keyword lists in Elixir; accept those as well as maps
        # so that passing one does not raise in Map.get/3 and take the server down.
        opts = if is_list(opts), do: Map.new(opts), else: opts

        input_schema = Map.get(opts, :inputSchema, scrape_chain.inputSchema)
        output_parser = Map.get(opts, :outputParser, scrape_chain.outputParser)
        output_format = Map.get(opts, :outputFormat, "JSON")

        temp_scrape_chain =
          LangChain.ScrapeChain.new(scrape_chain.chain, input_schema, output_parser)

        # The key names are part of the contract: the default prompt template refers to
        # inputText, inputSchema and outputFormat by exactly these names.
        input_variables = %{
          inputText: input_text,
          inputSchema: input_schema,
          outputFormat: output_format
        }

        result = LangChain.ScrapeChain.scrape(temp_scrape_chain, input_variables)
        {:reply, {:ok, result}, state}
    end
  end

  # TODO: consider moving default_scrape_chain/0 into the ScrapeChain module.

  # Builds the built-in "default_scraper" chain: a single user-role prompt asking to extract
  # data from the input text according to a schema and return it in the requested format.
  # Returns the value of ScrapeChain.new/3.
  defp default_scrape_chain() do
    # Default schema; it can be overruled per call with the :inputSchema option.
    input_schema = "{ name: String, age: Number }"

    # The template's <%= inputSchema %>, <%= inputText %> and <%= outputFormat %>
    # placeholders are filled from the input variables supplied at scrape time.
    # Schema and Text are each fenced by a pair of \"\"\" delimiters. The string's
    # continuation lines keep their own indentation because it is part of the prompt text.
    chat = Chat.addPromptTemplates(%Chat{}, [
      %{
        role: "user",
        prompt: %PromptTemplate{
          template: "Schema: \"\"\"
          <%= inputSchema %>
        \"\"\"
        Text: \"\"\"
          <%= inputText %>
        \"\"\"

        Extract the data from Text according to Schema and return it in <%= outputFormat %> format.
        Format any datetime fields using ISO8601 standard.
        "
        }
      }
    ])

    # The link keeps the raw response text; no JSON decoding happens at this stage.
    chain_link = %ChainLink{
      name: "schema_extractor",
      input: chat,
      outputParser: &passthru_parser/2
    }

    chain = %Chain{links: [chain_link]}
    ScrapeChain.new(chain, input_schema, &output_parser/1)
  end


  @doc """
  Default chain-link output parser: keeps the text of the first response.

  Returns `chain_link` with `rawResponses` set to `outputs` and `output` set to
  `%{text: text}`, where `text` is the `:text` value of the first element of `outputs`.
  """
  def passthru_parser(chain_link, outputs) do
    first_response = List.first(outputs)
    text = Map.get(first_response, :text)

    %{chain_link | rawResponses: outputs, output: %{text: text}}
  end

  @doc """
  Chain-link output parser that decodes the first response's text as JSON.

  The `:text` value of the first element of `outputs` is passed to `Jason.decode/1`.
  When decoding succeeds, the decoded term becomes the link's `output`; when it fails,
  the raw response text is kept as `output` instead. In both cases `rawResponses` is set
  to `outputs`.

  Returns the updated `chain_link`.
  """
  def json_parser(chain_link, outputs) do
    response_text = outputs |> List.first() |> Map.get(:text)

    output =
      case Jason.decode(response_text) do
        {:ok, json} -> json
        # Not valid JSON: fall back to the unparsed text rather than failing the chain.
        {:error, _reason} -> response_text
      end

    %{chain_link | rawResponses: outputs, output: output}
  end

  @doc """
  Pass-through output parser: returns `result` unchanged.
  """
  def output_parser(result), do: result
end