defmodule ElixirRss do
@moduledoc """
ElixirRss is a simple feed parser originally meant to parse the feed from crypto news.
## Examples
```elixir
> {:ok, feed} = ElixirRss.fetch_and_parse "https://cointelegraph.com/rss"
...
> {:ok, feed} = ElixirRss.parse "<rss version=\"2.0\" xmlns:content=\"http://purl.org/rss/1.0/modules/content/\" ..."
...
> feed.title
"Cointelegraph.com News"
> feed.entries |> Enum.map(&(&1.title))
["Kazakhstan among top 3 Bitcoin mining destinations after US and China", "3Commas issues security alert as FTX deletes API keys following hack", ...]
```
## Results
#### Feed
- `id` feed identifier (usually the site url)
- `title` feed title
- `description` feed description
- `url` feed url
- `site_url` feed main site url
- `updated` feed last modification timestamp
- `entries` entry list
#### Entry
- `id` unique identifier (sha256)
- `title` entry title
- `url` entry permalink
- `content` entry content
- `image` url of the thumbnail
- `updated` entry publication or modification timestamp
"""
require Logger
alias ElixirRss.Helpers.{Fetch, Sanitizer}
alias ElixirRss.Parsers.{Atom, RSS1, RSS2}
@doc """
Parses a `xml` string.
## Examples
iex> ElixirRss.parse "<rss version="2.0"><channel><title>9GAG RSS feed</title><description>Free 9GAG RSS feed</description>..."
{:ok, %{id: "http://9gagrss.com/", title: "9GAG RSS feed", description: "Free 9GAG RSS feed"...}}
iex> ElixirRss.parse "foo"
{:error, :invalid_xml}
iex> ElixirRss.parse("<!DOCTYPE html><html lang="en"><head><meta charset...")
{:error, :unknown_feed_format}
"""
def parse(xml, url \\ "") do
with {:ok, doc} <- read_xml_doc(xml),
{:ok, parser} <- select_parser(doc),
{:ok, feed} <- parser.parse(doc, url) do
entries =
feed.entries
|> Enum.filter(fn e -> e.title && e.content end)
|> Enum.sort(&(DateTime.compare(&1[:updated], &2[:updated]) != :gt))
|> Enum.map(&sanitize_entry(&1))
{:ok, %{feed | entries: entries}}
end
end
@doc """
Similar to `parse/2` but raises `ArgumentError` if unable to parse the `xml`.
## Examples
iex> ElixirRss.parse! "<rss version="2.0"><channel><title>9GAG RSS feed</title><description>Free 9GAG RSS feed</description>..."
{:ok, %{id: "http://9gagrss.com/", title: "9GAG RSS feed", description: "Free 9GAG RSS feed"...}}
iex> ElixirRss.parse! "foo"
** (ArgumentError) Not a valid XML
"""
def parse!(xml, url \\ "") do
with {:ok, feed} <- parse(xml, url) do
feed
else
_ -> raise ArgumentError, "Not a valid XML"
end
end
@doc """
Fetches the given `url` and parses the response using `parse/2`.
## Examples
iex> ElixirRss.fetch_and_parse "http://9gagrss.com/feed/"
%{id: "http://9gagrss.com/", title: "9GAG RSS feed", description: "Free 9GAG RSS feed"...}
iex> ElixirRss.fetch_and_parse "http://invalid-url"
{:error, :fetch_error}
"""
def fetch_and_parse(url) do
with {:ok, xml} <- Fetch.get(url),
{:ok, feed} <- parse(xml, url) do
{:ok, %{feed | url: url}}
end
end
# --
defp read_xml_doc(xml) do
try do
{:ok, SweetXml.parse(xml, quiet: true, namespace_conformant: true)}
rescue
FunctionClauseError -> {:error, :invalid_xml}
catch
:exit, _ -> {:error, :invalid_xml}
end
end
defp select_parser(doc) do
cond do
Atom.valid?(doc) -> {:ok, Atom}
RSS1.valid?(doc) -> {:ok, RSS1}
RSS2.valid?(doc) -> {:ok, RSS2}
true -> {:error, :unknown_feed_format}
end
end
defp sanitize_entry(entry) do
content =
(entry.content || "")
|> Sanitizer.basic_html()
%{entry | content: content}
end
end