src/sqid.gleam

import gleam/bool
import gleam/dict
import gleam/float
import gleam/int
import gleam/list
import gleam/result
import gleam/set.{type Set}
import gleam/string

/// An alphabet that can be used to encode and decode Sqids.
/// The alphabet used to encode numbers determines which characters can be used
/// in their string representation.
/// You can build one using the [`alphabet`](#alphabet) function.
///
/// ## Examples
///
/// If you want Sqids that only contains lowercase letters you can define an
/// alphabet like this:
///
/// ```gleam
/// let assert Ok(alphabet) =
///   sqid.alphabet("abcdefghijklmnopqrstuvwxyz")
/// ```
///
pub opaque type Alphabet {
  Alphabet(
    /// This goes from index to character. If the alphabet were `"abc"`, then
    /// this would be `dict.from_list([#(0, "a"), #(1, "b"), #(2, "c")])`.
    letters: dict.Dict(Int, String),
  )
}

/// The options used to encode and decode a Sqid.
/// You can create one from a valid `Alphabet` with the [`new`](#new) function.
///
pub opaque type Options {
  Options(
    /// The alphabet determining which characters can appear in the generated
    /// Sqid.
    alphabet: Alphabet,
    /// The minimum length that a Sqid must have.
    /// This value must be between 0 and 255, any value lower or higher is
    /// considered either 0 or 255.
    minimum_length: Int,
    /// A list of words that must not appear in the generated Sqid.
    blocklist: List(String),
  )
}

/// Creates a new alphabet from the letters making up the given string.
/// An alphabet is what defines what characters can end up in a generated Sqid.
///
/// This function will fail if:
/// - The alphabet has less than 3 characters
/// - The alphabet contains repeated characters
/// - The alphabet has multi-byte characters
///
/// ## Examples
///
/// If you want Sqids that only contains lowercase letters you can define an
/// alphabet like this:
///
/// ```gleam
/// let assert Ok(alphabet) =
///   sqid.alphabet("abcdefghijklmnopqrstuvwxyz")
/// ```
///
/// ```gleam
/// assert Error(Nil) == sqid.alphabet("wibble")
///   as "wrong alphabet: it contains duplicates"
///
/// assert Error(Nil) == sqid.alphabet("dziękuję")
///   as "wrong alphabet: it contains non ascii characters"
///
/// assert Error(Nil) == sqid.alphabet("01")
///   as "wrong alphabet: it contains less than 3 characters"
/// ```
///
pub fn alphabet(string: String) -> Result(Alphabet, Nil) {
  let letters = string.to_graphemes(string)
  use <- bool.guard(when: has_duplicates(letters), return: Error(Nil))
  use <- bool.guard(when: list.any(letters, is_multi_byte), return: Error(Nil))

  case letters {
    [] | [_] | [_, _] -> Error(Nil)
    _ -> {
      let letters =
        list.index_fold(letters, dict.new(), fn(acc, letter, index) {
          dict.insert(acc, index, letter)
        })
      Ok(shuffle_alphabet(Alphabet(letters:)))
    }
  }
}

fn is_multi_byte(string: String) -> Bool {
  string.byte_size(string) > 1
}

fn has_duplicates(list: List(a)) -> Bool {
  has_duplicates_loop(list, set.new())
}

fn has_duplicates_loop(list: List(a), seen_so_far: Set(a)) -> Bool {
  case list {
    [] -> False
    [first, ..rest] ->
      case set.contains(seen_so_far, first) {
        True -> True
        False -> has_duplicates_loop(rest, set.insert(seen_so_far, first))
      }
  }
}

/// Creates an `Options` object from a valid alphabet.
/// `Options` determine how Sqids are encoded and decoded.
///
/// If you want to change the generated Sqids you can also use:
/// - [`set_minimum_length`](#set_minimum_length): to pick a minimum length for
///   the generated Sqids.
/// - [`set_blocklist`](#set_blocklist): to specify a blocklist of words that
///   will not end up in any of the generated Sqids.
///
pub fn new(alphabet: Alphabet) -> Options {
  Options(alphabet:, minimum_length: 0, blocklist: [])
}

/// Sets the minimum length of the generated Sqids.
/// The value must be between 0 and 255. Any value lower than 0 is considered
/// as 0, while any value higher than 255 is considered as 255.
///
pub fn set_minimum_length(options: Options, minimum_length: Int) -> Options {
  Options(
    ..options,
    minimum_length: int.clamp(minimum_length, min: 0, max: 255),
  )
}

/// Sets the list of blocked words that cannot appear inside any of the
/// generated Sqids.
/// This is useful if you want to make sure a generated Sqid cannot contain
/// profanities.
///
/// There's a couple of things to note:
/// - The blocklist is case insensitive. If `"gleam"` is a blocked word,
///   generated Sqids will not contain any of `"gLeaM"`, `"GLEAM"`, `"GLeam"`,
///   ... and so on.
/// - All words in the blocklist must be more than three characters long, any
///   word shorter than that will be ignored.
///
pub fn set_blocklist(options: Options, blocklist: List(String)) -> Options {
  let alphabet_string = alphabet_to_string(options.alphabet)
  let blocklist =
    list.fold(blocklist, [], fn(blocklist, word) {
      // The block list is case insensitive, all words are turned to lowercase.
      let word = string.lowercase(word)
      case string.to_graphemes(word) {
        // Words that are less than three characters are ignored.
        [] | [_] | [_, _] -> blocklist
        // Otherwise they can be added. But, we can preemptively remove all the
        // strings that contain letters that are not in the alphabet.
        // Since we only generate
        letters ->
          case list.all(letters, string.contains(alphabet_string, _)) {
            True -> [word, ..blocklist]
            False -> blocklist
          }
      }
    })

  Options(..options, blocklist:)
}

/// Encodes a list of numbers into a readable string using the given options.
/// This function might fail if it's not possible to generate a string that is
/// not excluded by the options' blocklist.
///
pub fn encode(options: Options, numbers: List(Int)) -> Result(String, Nil) {
  case numbers {
    [] -> Ok("")
    _ -> do_encode(options, numbers, 0)
  }
}

fn do_encode(
  options: Options,
  numbers: List(Int),
  attempts: Int,
) -> Result(String, Nil) {
  case attempts >= dict.size(options.alphabet.letters) {
    // We've reached the maximum number of attempts to generate an id and
    // couldn't generate one!
    True -> Error(Nil)
    False -> {
      // Otherwise the algorithm works as follow: we start by shifting the
      // alphabet, and get the first separator from this shifted one.
      let alphabet_size = dict.size(options.alphabet.letters)
      use offset <- result.try(semi_random_offset(options.alphabet, numbers))
      let offset = offset % alphabet_size
      let offset = { offset + attempts } % alphabet_size
      let alphabet = shift_alphabet(options.alphabet, offset)
      let id = alphabet_separator(alphabet)

      // We then have to reverse the alphabet and iterate over all the numbers
      // encoding each one into a single id.
      let alphabet = reverse_alphabet(alphabet)
      let #(id, alphabet) = encode_numbers(numbers, alphabet, id)

      // Then we keep adding pad characters to reach the minimum character
      // limit.
      let id = pad_to_length(id, options.minimum_length, alphabet)

      // Finally we check if the id contains any word that is in the block list.
      // If it does it is discarded and we try generating a new one!
      case is_blocked_id(id, options) {
        True -> do_encode(options, numbers, attempts + 1)
        False -> Ok(id)
      }
    }
  }
}

/// Returns true if the id contains any of the words that are in the options'
/// block list.
/// Checking is case insensitive: if `"gleam"` is not allowed then `"GLeaM"`
/// would be rejected as well.
///
fn is_blocked_id(id: String, options: Options) -> Bool {
  let id = string.lowercase(id)
  let id_size = string.byte_size(id)
  list.any(options.blocklist, satisfying: fn(blocked_word) {
    let blocked_word_size = string.byte_size(blocked_word)
    case blocked_word_size > id_size {
      // If the word is longer than the id, then it's impossible for the id to
      // contain it, we keep going.
      True -> False
      // Otherwise we have to check if the blocked word is inside the id.
      // For short ids/words, we only check if they are exactly the same,
      // otherwise we would be excluding way too many ids.
      False if id_size <= 3 || blocked_word_size <= 3 -> id == blocked_word
      // Otherwise we just check as regular if the id contains the word.
      False -> string.contains(id, blocked_word)
    }
  })
}

fn encode_numbers(
  numbers: List(Int),
  alphabet: Alphabet,
  id: String,
) -> #(String, Alphabet) {
  case numbers {
    [] -> #(id, alphabet)
    [last] -> #(id <> number_to_id(last, alphabet), alphabet)
    // If the number is not the last one, we need to turn it into a string id,
    // and then add a separator character before the next one.
    [first, ..rest] -> {
      let id = id <> number_to_id(first, alphabet)
      let id = id <> alphabet_separator(alphabet)
      // Before dealing with each number we need to shuffle the alphabet.
      let alphabet = shuffle_alphabet(alphabet)
      encode_numbers(rest, alphabet, id)
    }
  }
}

fn number_to_id(number: Int, alphabet: Alphabet) -> String {
  number_to_id_loop(number, alphabet, [])
}

fn number_to_id_loop(
  number: Int,
  alphabet: Alphabet,
  acc: List(String),
) -> String {
  // The number is used as an index to pick a letter from the alphabet.
  // The only thing we need to be careful with is that the first character in
  // the alphabet (the one at index 0), is used as the separator.
  // So we can only pick letters with indices `[1, alphabet_size - 1]`.
  let alphabet_size = dict.size(alphabet.letters) - 1
  let assert Ok(letter) =
    dict.get(alphabet.letters, { number % alphabet_size } + 1)

  // We then divide the number by the alphabet size and keep going picking
  // letters until we get to 0.
  // Why do we do that? Ask the Sqid folks, that's just how this works!
  let acc = [letter, ..acc]
  let number =
    float.floor(int.to_float(number) /. int.to_float(alphabet_size))
    |> float.round

  case number > 0 {
    True -> number_to_id_loop(number, alphabet, acc)
    False -> string.join(acc, "")
  }
}

fn pad_to_length(id: String, length: Int, alphabet: Alphabet) -> String {
  let id_length = string.byte_size(id)
  case id_length < length {
    False -> id
    True -> {
      let id = id <> alphabet_separator(alphabet)
      let missing = length - id_length - 1
      pad_to_length_loop(id, missing, alphabet)
    }
  }
}

fn pad_to_length_loop(id: String, missing: Int, alphabet: Alphabet) -> String {
  case missing > 0 {
    False -> id
    True -> {
      let alphabet = shuffle_alphabet(alphabet)
      let alphabet_size = dict.size(alphabet.letters)
      let slice_size = int.min(missing, alphabet_size)
      let slice = string.slice(alphabet_to_string(alphabet), 0, slice_size)
      let id = id <> slice
      let missing = missing - slice_size
      pad_to_length_loop(id, missing, alphabet)
    }
  }
}

/// Returns the separator of the alphabet, the separator changes based on how
/// the
fn alphabet_separator(alphabet: Alphabet) -> String {
  let assert Ok(separator) = dict.get(alphabet.letters, 0) as "empty alphabet"
  separator
}

/// Shifts the alphabet by the given offset.
/// For example if alphabet is `"abcde"` and I end up shifting it by 2 to the
/// left it ends up being `"cdeab"`.
///
fn shift_alphabet(alphabet: Alphabet, offset: Int) {
  // All values after the offset need to be moved to the start of the alphabet:
  //
  //   index    0 1 2 3 4
  //   letter   a b c d e
  //                  ^ imagine offset is 3
  //
  // We need to move `d` and `e` so that they start from 0 onward, we can do
  // that by subtracting the offset from their index:
  //
  // - `d` goes to `3 - 3` -> `0`
  // - `e` goes to `4 - 3` -> `1`
  //
  // All good! With numbers that come before the offset they'll have to start
  // after all the number that we've moved. How many numbers are we moving?
  // All the numbers that go from `offset` to the end of the alphabet; in this
  // example if would be just two (`d` and `e`). In general that's given by
  // `alphabet_size - offset`.
  //
  // - We can check this works for our example: we're moving
  //   `alphabet_size - offset` -> `5 - 3` -> `2` characters
  // - `a` goes to `0 + 2` -> `2`
  // - `b` goes to `1 + 2` -> `3`
  // - `c` goes to `2 + 2` -> `4`
  let alphabet_size = dict.size(alphabet.letters)
  let letters =
    dict.fold(alphabet.letters, dict.new(), fn(letters, index, letter) {
      let new_index = case index < offset {
        True -> alphabet_size - offset + index
        False -> index - offset
      }
      dict.insert(letters, new_index, letter)
    })
  Alphabet(letters:)
}

/// This computes an offset from a list of numbers, the Sqid spec calls this
/// "semi random": it's random looking but it's actually deterministically
/// decided by the numbers and alphabet we pass as arguments.
/// This returns an error if any of the numbers is negative.
fn semi_random_offset(
  alphabet: Alphabet,
  numbers: List(Int),
) -> Result(Int, Nil) {
  let alphabet_size = dict.size(alphabet.letters)
  list.try_fold(numbers, #(list.length(numbers), 0), fn(pair, number) {
    let #(acc, index) = pair
    case number < 0 {
      True -> Error(Nil)
      False -> {
        let assert Ok(letter_value) =
          letter_value(alphabet.letters, number % alphabet_size)
          as "index outside of dictionary"

        Ok(#(letter_value + index + acc, index + 1))
      }
    }
  })
  |> result.map(fn(pair) { pair.0 })
}

/// Given a dictionary of letters this returns the codepoint value of the letter
/// at the given index.
/// This would fail if:
/// - the index is not included in the dictionary
/// - the dictionary value is an empty string
///
/// We only ever use this internally after validating an alphabet and making
/// sure that all the entries are single letters, so under this assumption it
/// should never fail.
fn letter_value(
  letters: dict.Dict(Int, String),
  index: Int,
) -> Result(Int, Nil) {
  dict.get(letters, index)
  |> result.map(string.to_utf_codepoints)
  |> result.try(list.first)
  |> result.map(string.utf_codepoint_to_int)
}

/// Shuffles an alphabet.
/// Don't be fooled by the name: the shuffle might look random, but it's
/// actually deterministic. Given an alphabet its shuffled version is always
/// going to be the same.
///
fn shuffle_alphabet(alphabet: Alphabet) -> Alphabet {
  let letters =
    shuffle_alphabet_loop(alphabet.letters, 0, dict.size(alphabet.letters) - 1)
  Alphabet(letters:)
}

fn shuffle_alphabet_loop(
  letters: dict.Dict(Int, String),
  i: Int,
  j: Int,
) -> dict.Dict(Int, String) {
  case j <= 0 {
    True -> letters
    False -> {
      let assert Ok(i_value) = letter_value(letters, i) as "i always in range"
      let assert Ok(j_value) = letter_value(letters, j) as "j always in range"
      let r = { i * j + i_value + j_value } % dict.size(letters)

      let assert Ok(old_i) = dict.get(letters, i) as "i always in range"
      let assert Ok(old_r) = dict.get(letters, r) as "r always in range"
      let letters = dict.insert(letters, i, old_r) |> dict.insert(r, old_i)
      shuffle_alphabet_loop(letters, i + 1, j - 1)
    }
  }
}

/// This reverses an alphabet: if the initial alphabet is "abcd", the reversed
/// alphabet is going to be "dcba".
fn reverse_alphabet(alphabet: Alphabet) -> Alphabet {
  let size = dict.size(alphabet.letters)
  let letters =
    dict.fold(alphabet.letters, dict.new(), fn(letters, index, letter) {
      dict.insert(letters, size - 1 - index, letter)
    })

  Alphabet(letters:)
}

fn alphabet_to_string(alphabet: Alphabet) -> String {
  dict.to_list(alphabet.letters)
  |> list.sort(fn(one, other) { int.compare(one.0, other.0) })
  |> list.map(fn(pair) { pair.1 })
  |> string.join(with: "")
}

// --- DECODING ----------------------------------------------------------------

/// Decodes a Sqid into the list of integers that was used to generate it.
/// To decode a Sqid, always use the same options that were used to encode it,
/// otherwise you will get nonsense results!
///
/// This function will fail if the Sqid has carachters that are not allowed by
/// these options, meaning that the Sqid was encoded with a different alphabet.
///
pub fn decode(options: Options, sqid: String) -> Result(List(Int), Nil) {
  case string.pop_grapheme(sqid) {
    // The string is empty, so it was encoded from the empty list
    Error(_) -> Ok([])
    // Otherwise we use the prefix to get back to the original offset we used to
    // shift the alphabet, so we can shift it back into its original form.
    Ok(#(separator, sqid)) ->
      case index(alphabet_to_string(options.alphabet), of: separator) {
        // The Sqid's character is not in the alphabet. This means it was
        // encoded using a different alphabet.
        Error(_) -> Error(Nil)
        Ok(offset) ->
          shift_alphabet(options.alphabet, offset)
          |> reverse_alphabet
          |> decode_loop(sqid, [])
      }
  }
}

fn decode_loop(
  alphabet: Alphabet,
  sqid: String,
  numbers: List(Int),
) -> Result(List(Int), Nil) {
  let separator = alphabet_separator(alphabet)
  case string.split(sqid, separator) {
    // If the string is empty, or the first chunk is empty we're done (the rest
    // are junk padding characters).
    [] | ["", ..] -> Ok(list.reverse(numbers))
    // If the chunk is empty we're done, the rest are junk padding characters.
    [id, ..rest] ->
      case id_to_number(id, alphabet) {
        Error(_) -> Error(Nil)
        Ok(number) -> {
          let numbers = [number, ..numbers]
          let alphabet = shuffle_alphabet(alphabet)
          let sqid = string.join(rest, with: separator)
          decode_loop(alphabet, sqid, numbers)
        }
      }
  }
}

fn id_to_number(id: String, alphabet: Alphabet) -> Result(Int, Nil) {
  let alphabet_size = dict.size(alphabet.letters) - 1
  let alphabet_string = alphabet_to_string(alphabet)

  list.try_fold(string.to_graphemes(id), 0, fn(acc, letter) {
    use index <- result.try(index(alphabet_string, of: letter))
    Ok(acc * alphabet_size + index - 1)
  })
}

fn index(string: String, of separator: String) -> Result(Int, Nil) {
  use #(before, _) <- result.try(string.split_once(string, on: separator))
  Ok(string.byte_size(before))
}