docs/performance-tuning.md

Select File:
docs/performance-tuning.md

# Performance Tuning

Optimizing TantivyEx for production requires understanding how indexing, searching, and memory usage affect performance.

## Index Design for Performance

### Choose the Right Field Options

Different field options have different performance characteristics:

```elixir
# Rebind the result of every call, as the other examples in this guide do:
# the value returned by `Schema.add_*_field` is the schema to keep using.

# For fields you only search (no retrieval needed)
schema = Schema.add_text_field(schema, "content", :text)

# For fields you search and retrieve
schema = Schema.add_text_field(schema, "title", :text_stored)

# For fast filtering and aggregation
schema = Schema.add_u64_field(schema, "timestamp", :fast)

# For both retrieval and fast operations
schema = Schema.add_f64_field(schema, "price", :fast_stored)
```

**Performance Guidelines:**

- Use `:text` for content you only search, not retrieve
- Use `:fast` for fields used in range queries or sorting
- Use `_stored` variants only when you need to retrieve the original value
- Avoid storing large text fields if you don't need them in results

### Optimize Your Schema

```elixir
# ❌ Poor performance - storing large content unnecessarily
# (the `_stored` variant keeps the original text so it can be retrieved)
schema = Schema.add_text_field(schema, "full_content", :text_stored)

# ✅ Better - only index for search
# (this replaces the line above; declare "full_content" one way or the other, not both)
schema = Schema.add_text_field(schema, "full_content", :text)

# Store a separate summary field for display
# (a short stored field gives results something to show without storing the full text)
schema = Schema.add_text_field(schema, "summary", :text_stored)
```

### Field Type Selection Impact

```elixir
defmodule SchemaOptimizer do
  @moduledoc """
  Example schema that picks a field option per use case: stored only where the
  value is displayed, fast only where the value is filtered or sorted on.
  """

  alias TantivyEx.Schema

  @doc """
  Builds the example schema.

  Returns `{:ok, schema}`.
  """
  def create_optimized_schema do
    # Every `Schema.add_*` call is treated the same way here: it takes a schema
    # and returns the updated schema, so the whole definition is one pipeline.
    schema =
      Schema.new()
      # Text fields - choose based on use case
      # Search + display
      |> Schema.add_text_field("title", :text_stored)
      # Search only
      |> Schema.add_text_field("content", :text)
      # Display only
      |> Schema.add_text_field("summary", :stored)
      # Numeric fields - optimize for operations
      # Filtering/sorting
      |> Schema.add_u64_field("timestamp", :fast)
      # Filter + display
      |> Schema.add_f64_field("price", :fast_stored)
      # Display only
      |> Schema.add_u64_field("view_count", :stored)
      # Facet fields - for navigation
      |> Schema.add_facet_field("category", :facet)

    {:ok, schema}
  end
end
```

## Indexing Performance

### Batch Operations

Always prefer committing once per batch over committing after every document:

```elixir
# ❌ Slow - individual commits
{:ok, writer} = TantivyEx.IndexWriter.new(index)

for doc <- documents do
  TantivyEx.IndexWriter.add_document(writer, doc)
  TantivyEx.IndexWriter.commit(writer)  # Don't do this!
end

# ✅ Fast - batch commit
{:ok, writer} = TantivyEx.IndexWriter.new(index)
Enum.each(documents, &TantivyEx.IndexWriter.add_document(writer, &1))
TantivyEx.IndexWriter.commit(writer)  # Single commit at the end
```

### Optimize Commit Frequency

```elixir
defmodule BulkIndexer do
  @moduledoc """
  Indexes a collection in fixed-size batches, committing once per batch.
  """

  # Logger.warning/1 is a macro, so the module must require Logger.
  require Logger

  @batch_size 1000

  @doc """
  Adds `documents` to `index` in batches of #{@batch_size}, committing after
  each batch.

  Documents that fail to add, and commits that fail, are logged and skipped.
  Returns `:ok`.
  """
  def index_documents(index, documents) do
    {:ok, writer} = TantivyEx.IndexWriter.new(index)

    documents
    |> Enum.chunk_every(@batch_size)
    |> Enum.each(fn batch ->
      add_batch(writer, batch)
      commit_batch(writer)

      # Optional: brief pause to prevent overwhelming the system
      Process.sleep(100)
    end)
  end

  # Adds every document of one batch, logging the ones that are rejected.
  defp add_batch(writer, documents) do
    Enum.each(documents, fn doc ->
      case TantivyEx.IndexWriter.add_document(writer, doc) do
        :ok ->
          :ok

        {:error, reason} ->
          Logger.warning("Failed to add document: #{inspect(reason)}")
      end
    end)
  end

  # Commits the pending batch; a failed commit is logged instead of being
  # silently dropped.
  defp commit_batch(writer) do
    case TantivyEx.IndexWriter.commit(writer) do
      {:error, reason} ->
        Logger.warning("Failed to commit batch: #{inspect(reason)}")

      _committed ->
        :ok
    end
  end
end
```

### Parallel Indexing

```elixir
defmodule ParallelIndexer do
  @moduledoc """
  Adds documents from several processes at once and commits a single time.
  """

  @doc """
  Splits `documents` into at most `num_workers` chunks, adds each chunk from
  its own task, then commits once.

  Returns the `Task.async_stream/3` results, one `{:ok, :ok}` per chunk
  (an empty list when `documents` is empty).
  """
  def index_documents_parallel(index, documents, num_workers \\ 4)
      when is_integer(num_workers) and num_workers > 0 do
    # Tantivy holds a lock per index that allows one writer at a time, so the
    # writer is opened once and shared instead of being opened inside each task.
    # NOTE(review): assumes the writer handle may be used from other processes - confirm.
    {:ok, writer} = TantivyEx.IndexWriter.new(index)

    # Ceiling division, never below 1: `Enum.chunk_every/2` raises on a chunk
    # size of 0, which plain `div/2` yields when there are fewer documents
    # than workers.
    chunk_size =
      documents
      |> length()
      |> Kernel.+(num_workers - 1)
      |> div(num_workers)
      |> max(1)

    results =
      documents
      |> Enum.chunk_every(chunk_size)
      |> Task.async_stream(
        fn chunk ->
          Enum.each(chunk, &TantivyEx.IndexWriter.add_document(writer, &1))
        end,
        max_concurrency: num_workers,
        timeout: 60_000
      )
      |> Enum.to_list()

    # One commit for the whole run, after every task has finished adding.
    TantivyEx.IndexWriter.commit(writer)

    results
  end
end
```

## Search Performance

### Query Optimization

```elixir
{:ok, searcher} = TantivyEx.Searcher.new(index)

# ❌ Slow - overly broad queries
match_everything = "*"
TantivyEx.Searcher.search(searcher, match_everything, 10_000)

# ✅ Fast - specific queries with reasonable limits
narrow_query = "specific terms"
TantivyEx.Searcher.search(searcher, narrow_query, 50)

# ❌ Slow - complex boolean queries without field targeting
untargeted_boolean = "(a OR b OR c) AND (d OR e OR f)"
TantivyEx.Searcher.search(searcher, untargeted_boolean, 100)

# ✅ Fast - field-specific queries
field_targeted = "title:(important terms) AND category:specific"
TantivyEx.Searcher.search(searcher, field_targeted, 100)
```

### Result Limiting and Pagination

```elixir
defmodule SearchOptimizer do
  @moduledoc """
  Search helpers that keep result sets small: a capped limit, page slicing for
  moderate depths, and a cursor-style query for deep pagination.
  """

  @doc """
  Runs `query` against `index`, asking for at most `limit` hits (default 20).

  Don't retrieve more results than you need. Returns whatever
  `TantivyEx.Searcher.search/3` returns.
  """
  def search_with_limit(index, query, limit \\ 20) do
    {:ok, searcher} = TantivyEx.Searcher.new(index)
    TantivyEx.Searcher.search(searcher, query, limit)
  end

  @doc """
  Returns `{:ok, hits}` for the 1-based `page` of size `per_page`.

  Fetches the first `page * per_page` hits and slices the last page off them,
  which is only efficient for moderate depths. For that reason `page` must be
  in `1..100` and `per_page` a positive integer; anything else raises
  `FunctionClauseError` instead of producing a negative limit or slice offset.
  Search errors are returned unchanged.
  """
  def paginated_search(index, query, page, per_page)
      when page in 1..100 and is_integer(per_page) and per_page > 0 do
    limit = page * per_page
    {:ok, searcher} = TantivyEx.Searcher.new(index)

    case TantivyEx.Searcher.search(searcher, query, limit) do
      {:ok, all_results} ->
        start_index = (page - 1) * per_page
        {:ok, Enum.slice(all_results, start_index, per_page)}

      error ->
        error
    end
  end

  @doc """
  Deep-pagination sketch: restricts `query` to documents whose `timestamp` is
  greater than `cursor` and returns up to `per_page` hits.

  The implementation depends on your specific use case; consider a timestamp
  or ID field as the cursor.
  """
  def cursor_based_search(index, query, cursor, per_page) do
    # `cursor` is interpolated into the query string as-is.
    # NOTE(review): presumably a numeric value from a previous page - validate it
    # if it can come from user input.
    enhanced_query = "#{query} AND timestamp:>#{cursor}"
    search_with_limit(index, enhanced_query, per_page)
  end
end
```

### Query Caching

```elixir
defmodule QueryCache do
  @moduledoc """
  Simple in-memory cache for frequent queries, keyed by `{query, limit}`.

  Each entry is stored with a strictly increasing insertion stamp so that
  eviction can really keep the most recently cached entries; a plain map has
  no insertion order to rely on.
  """

  use GenServer

  # Eviction starts once the cache holds more than @max_entries entries and
  # trims it down to the @keep_entries most recently inserted ones.
  @max_entries 1000
  @keep_entries 500

  def start_link(_opts) do
    GenServer.start_link(__MODULE__, %{}, name: __MODULE__)
  end

  @doc """
  Returns the results for `query`/`limit`, from the cache when present,
  otherwise by searching `index` and caching the outcome.

  Raises `MatchError` when the search itself fails.
  """
  def search_cached(index, query, limit) do
    cache_key = {query, limit}

    case GenServer.call(__MODULE__, {:get, cache_key}) do
      nil ->
        {:ok, results} = search_and_cache(index, query, limit, cache_key)
        results

      cached_results ->
        cached_results
    end
  end

  # Runs the search and, on success, hands the results to the cache process.
  defp search_and_cache(index, query, limit, cache_key) do
    {:ok, searcher} = TantivyEx.Searcher.new(index)

    case TantivyEx.Searcher.search(searcher, query, limit) do
      {:ok, results} = success ->
        GenServer.cast(__MODULE__, {:put, cache_key, results})
        success

      error ->
        error
    end
  end

  # GenServer callbacks

  @impl true
  def init(state), do: {:ok, state}

  @impl true
  def handle_call({:get, key}, _from, cache) do
    reply =
      case cache do
        %{^key => {_stamp, value}} -> value
        _miss -> nil
      end

    {:reply, reply, cache}
  end

  @impl true
  def handle_cast({:put, key, value}, cache) do
    # Strictly increasing within this VM, so two entries never tie on recency.
    stamp = System.unique_integer([:monotonic])

    new_cache =
      cache
      |> Map.put(key, {stamp, value})
      |> maybe_evict_old_entries()

    {:noreply, new_cache}
  end

  # Keeps only the most recently inserted @keep_entries entries once the cache
  # has grown past @max_entries.
  defp maybe_evict_old_entries(cache) when map_size(cache) > @max_entries do
    cache
    |> Enum.sort_by(fn {_key, {stamp, _value}} -> stamp end, :desc)
    |> Enum.take(@keep_entries)
    |> Map.new()
  end

  defp maybe_evict_old_entries(cache), do: cache
end
```

## Memory Management

### Index Size Monitoring

```elixir
defmodule IndexMonitor do
  @moduledoc """
  Reports how much disk space an index occupies and flags growth past a
  threshold.
  """

  require Logger

  @bytes_per_mb 1024 * 1024

  @doc """
  Measures the size of `index_path` in megabytes.

  When the path is a directory, the sizes of all files beneath it are summed;
  `File.stat/1` on the directory alone would only report the size of the
  directory entry. Returns `{:ok, size_mb}` or `{:error, reason}`, logging
  the outcome either way.
  """
  def check_index_stats(index_path) do
    case disk_usage(index_path) do
      {:ok, size} ->
        size_mb = size / @bytes_per_mb
        Logger.info("Index size: #{Float.round(size_mb, 2)} MB")
        {:ok, size_mb}

      {:error, reason} ->
        Logger.error("Could not get index stats: #{reason}")
        {:error, reason}
    end
  end

  @doc """
  Compares the index size with `threshold_mb` (default 1000).

  Returns `:threshold_exceeded` (after logging a warning) when the index is
  larger, `:ok` when it is not, or the `{:error, reason}` from
  `check_index_stats/1`.
  """
  def monitor_index_growth(index_path, threshold_mb \\ 1000) do
    case check_index_stats(index_path) do
      {:ok, size_mb} when size_mb > threshold_mb ->
        Logger.warning("Index size (#{size_mb} MB) exceeds threshold (#{threshold_mb} MB)")
        :threshold_exceeded

      {:ok, _size_mb} ->
        :ok

      error ->
        error
    end
  end

  # Size in bytes of a file, or of everything below a directory.
  defp disk_usage(path) do
    case File.stat(path) do
      {:ok, %File.Stat{type: :directory}} -> directory_usage(path)
      {:ok, %File.Stat{size: size}} -> {:ok, size}
      {:error, _reason} = error -> error
    end
  end

  # Sums the entries of one directory, stopping at the first real error.
  defp directory_usage(dir) do
    with {:ok, entries} <- File.ls(dir) do
      Enum.reduce_while(entries, {:ok, 0}, fn entry, {:ok, total} ->
        case disk_usage(Path.join(dir, entry)) do
          {:ok, size} ->
            {:cont, {:ok, total + size}}

          # The entry vanished between listing and stat (for example a file
          # removed while the directory was being walked); count it as empty.
          {:error, :enoent} ->
            {:cont, {:ok, total}}

          {:error, _reason} = error ->
            {:halt, error}
        end
      end)
    end
  end
end
```

### RAM vs Disk Indexes

```elixir
defmodule IndexStrategy do
  @moduledoc """
  Chooses between a RAM and a disk index from the dataset size and the memory
  available, and creates the chosen kind of index.
  """

  alias TantivyEx.Index

  @doc """
  Picks an index type for a dataset of `dataset_size_mb` given
  `available_ram_mb` of memory.

  Returns `{:ram_index, reason}` or `{:disk_index, reason}`, where `reason`
  is a human-readable explanation.
  """
  def choose_index_type(dataset_size_mb, available_ram_mb) do
    cond do
      dataset_size_mb < 100 and available_ram_mb > 1000 ->
        {:ram_index, "Small dataset, use RAM for speed"}

      dataset_size_mb < available_ram_mb * 0.5 ->
        {:ram_index, "Dataset fits comfortably in RAM"}

      true ->
        {:disk_index, "Dataset too large for RAM or limited memory"}
    end
  end

  @doc """
  Creates an index for `schema` according to a strategy returned by
  `choose_index_type/2`.

  For a disk index, `path` is used when given; otherwise a fresh directory is
  created under the system temp dir. Returns the result of
  `Index.create_in_ram/1` or `Index.create_in_dir/2`.
  """
  def create_optimized_index(schema, strategy, path \\ nil) do
    case strategy do
      {:ram_index, _reason} ->
        Index.create_in_ram(schema)

      {:disk_index, _reason} ->
        path = path || generate_temp_path()
        Index.create_in_dir(path, schema)
    end
  end

  # Creates and returns a unique directory for a temporary index. A timestamp
  # in seconds alone would hand out the same path to two calls made within
  # the same second, so a unique integer is appended.
  defp generate_temp_path do
    timestamp = System.system_time(:second)
    unique = System.unique_integer([:positive])
    path = Path.join(System.tmp_dir!(), "tantivy_index_#{timestamp}_#{unique}")

    File.mkdir_p!(path)
    path
  end
end
```

## Performance Benchmarking

```elixir
defmodule PerformanceBenchmark do
  @moduledoc """
  Micro-benchmarks for indexing throughput at different commit batch sizes and
  for query latency.
  """

  alias TantivyEx.{Index, Schema}

  @doc """
  Indexes `documents` into a fresh RAM index once per entry of `batch_sizes`,
  committing after every batch.

  Returns one map per batch size with `:batch_size`, `:time_ms` and
  `:docs_per_second`.
  """
  def benchmark_indexing(documents, batch_sizes \\ [100, 500, 1000, 5000]) do
    schema = create_test_schema()
    # length/1 walks the list, so count once instead of once per batch size.
    doc_count = length(documents)

    Enum.map(batch_sizes, fn batch_size ->
      {time_us, _index} =
        :timer.tc(fn -> index_with_batch_size(documents, schema, batch_size) end)

      # :timer.tc/1 reports microseconds. Clamp to 1 µs for the rate so a run
      # too short to measure cannot divide by zero.
      docs_per_second = doc_count / (max(time_us, 1) / 1_000_000)

      %{
        batch_size: batch_size,
        time_ms: time_us / 1000,
        docs_per_second: Float.round(docs_per_second, 2)
      }
    end)
  end

  @doc """
  Runs every query in `queries` against `index` with a limit of 100.

  Returns one map per query with `:query`, `:time_ms` and `:result_count`;
  a failed search counts as 0 results.
  """
  def benchmark_queries(index, queries) do
    {:ok, searcher} = TantivyEx.Searcher.new(index)

    Enum.map(queries, fn query ->
      {time_us, result} =
        :timer.tc(fn -> TantivyEx.Searcher.search(searcher, query, 100) end)

      result_count =
        case result do
          {:ok, results} -> length(results)
          _ -> 0
        end

      %{
        query: query,
        time_ms: time_us / 1000,
        result_count: result_count
      }
    end)
  end

  # Schema helpers are used the same way as in the rest of this guide: each
  # call returns the updated schema rather than an `{:ok, schema}` tuple.
  defp create_test_schema do
    Schema.new()
    |> Schema.add_text_field("title", :text_stored)
    |> Schema.add_text_field("content", :text)
    |> Schema.add_u64_field("timestamp", :fast)
  end

  # Builds a RAM index from `documents`, committing once per batch.
  defp index_with_batch_size(documents, schema, batch_size) do
    {:ok, index} = Index.create_in_ram(schema)
    {:ok, writer} = TantivyEx.IndexWriter.new(index)

    documents
    |> Enum.chunk_every(batch_size)
    |> Enum.each(fn batch ->
      Enum.each(batch, &TantivyEx.IndexWriter.add_document(writer, &1))
      TantivyEx.IndexWriter.commit(writer)
    end)

    index
  end
end
```

## Performance Best Practices Summary

### Do's ✅

- Batch document operations
- Use appropriate field types and options
- Monitor index size and performance
- Cache frequent queries
- Use specific, targeted queries
- Profile your application's search patterns

### Don'ts ❌

- Don't commit after every document
- Don't store fields you don't need to retrieve
- Don't use overly broad queries (`*`)
- Don't request more results than needed
- Don't ignore memory usage patterns
- Don't skip performance testing

### Monitoring in Production

```elixir
defmodule ProductionMonitoring do
  @moduledoc """
  Periodically measures the index size and publishes it as a telemetry event.
  """

  use GenServer
  require Logger

  # Time between two measurements: every minute.
  @interval_ms 60_000

  def start_link(index_path) do
    GenServer.start_link(__MODULE__, %{index_path: index_path}, name: __MODULE__)
  end

  @impl true
  def init(state) do
    schedule_monitoring()
    {:ok, state}
  end

  @impl true
  def handle_info(:monitor, %{index_path: index_path} = state) do
    case IndexMonitor.check_index_stats(index_path) do
      {:ok, size_mb} ->
        :telemetry.execute([:tantivy_ex, :index, :size], %{megabytes: size_mb})

      {:error, reason} ->
        Logger.error("Index monitoring failed: #{inspect(reason)}")
    end

    # Re-arm the timer after every run, including failed ones, so that
    # monitoring keeps going.
    schedule_monitoring()
    {:noreply, state}
  end

  # Defining handle_info/2 replaces the default from `use GenServer`; without
  # this clause any other message would crash the monitor.
  def handle_info(_message, state), do: {:noreply, state}

  defp schedule_monitoring do
    Process.send_after(self(), :monitor, @interval_ms)
  end
end
```