native/whisper_ct2_native/src/tokens.rs

Select File
native/whisper_ct2_native/src/tokens.rs

#![allow(clippy::cast_precision_loss)]

use anyhow::{Result, anyhow};
use ct2rs::tokenizers::hf;
use tokenizers::Tokenizer as InnerTokenizer;

/// IDs of every Whisper special token we need to interpret the generated
/// sequence. Resolved once at model load by querying the HF tokenizer
/// (see `SpecialTokens::resolve`).
pub(crate) struct SpecialTokens {
    /// `<|startoftranscript|>`
    pub(crate) sot: u32,
    /// `<|transcribe|>`
    pub(crate) transcribe: u32,
    /// `<|notimestamps|>`
    pub(crate) no_timestamps: u32,
    /// `<|0.00|>` — base of the 1501 timestamp-token range. Any token ID
    /// `>= timestamp_begin` is a timestamp; its value is
    /// `(id - timestamp_begin) * 0.02` seconds.
    ///
    /// Unlike the other fields this ID is not looked up in the vocab:
    /// `resolve` derives it as `no_timestamps + 1`.
    pub(crate) timestamp_begin: u32,
}

impl SpecialTokens {
    /// Resolves the special-token IDs from the tokenizer vocab.
    ///
    /// `<|notimestamps|>`, `<|startoftranscript|>` and `<|transcribe|>` are
    /// looked up by name. `timestamp_begin` is *not* looked up: this code
    /// follows faster-whisper's convention that the timestamp tokens
    /// (`<|0.00|>` ..= `<|30.00|>`) sit in the model output space directly
    /// after `<|notimestamps|>`, i.e. `timestamp_begin = no_timestamps_id + 1`.
    ///
    /// # Errors
    ///
    /// Returns an error naming the first of the three looked-up tokens that
    /// is missing from the vocab.
    pub(crate) fn resolve(inner: &InnerTokenizer) -> Result<Self> {
        // Lookup order (notimestamps, sot, transcribe) decides which missing
        // token gets reported first; keep it stable.
        let no_timestamps = lookup(inner, NO_TIMESTAMPS)?;
        let sot = lookup(inner, SOT)?;
        let transcribe = lookup(inner, TRANSCRIBE)?;

        // NOTE: `<|startofprev|>` is neither stored nor probed here, so its
        // presence in the vocab is not validated by this function.
        let timestamp_begin = no_timestamps + 1;

        Ok(Self {
            sot,
            transcribe,
            no_timestamps,
            timestamp_begin,
        })
    }
}

// Whisper special-token literals used in prompt construction. Centralised
// so the prompt builder and `SpecialTokens::resolve` cannot drift.

/// Start-of-transcript marker; always present in a built prompt.
pub(crate) const SOT: &str = "<|startoftranscript|>";
/// Marker placed before the initial-prompt tokens when an initial prompt
/// is supplied.
pub(crate) const STARTOFPREV: &str = "<|startofprev|>";
/// Task token that follows the language token for multilingual checkpoints.
pub(crate) const TRANSCRIBE: &str = "<|transcribe|>";
/// Appended to the prompt when timestamps are not requested; its ID plus
/// one is also used as `SpecialTokens::timestamp_begin`.
pub(crate) const NO_TIMESTAMPS: &str = "<|notimestamps|>";

/// Maps a required special token to its vocabulary ID.
///
/// # Errors
///
/// Returns an error naming `token` when the tokenizer has no ID for it.
fn lookup(inner: &InnerTokenizer, token: &str) -> Result<u32> {
    let Some(id) = inner.token_to_id(token) else {
        return Err(anyhow!("special token {token} missing from tokenizer vocab"));
    };
    Ok(id)
}

/// Formats `code` as a Whisper language token (`<|code|>`) and returns it
/// once the tokenizer confirms the token exists.
///
/// # Errors
///
/// Returns an error naming the formatted token when it is not in the vocab.
pub(crate) fn language_token(inner: &InnerTokenizer, code: &str) -> Result<String> {
    let token = format!("<|{code}|>");
    // Only existence matters here; the numeric ID itself is discarded.
    match inner.token_to_id(&token) {
        Some(_) => Ok(token),
        None => Err(anyhow!("language token {token} not in vocab")),
    }
}

/// Returns the vocabulary ID of an arbitrary token string.
///
/// # Errors
///
/// Returns an error naming `token` when the tokenizer has no ID for it.
pub(crate) fn token_id(inner: &InnerTokenizer, token: &str) -> Result<u32> {
    let Some(id) = inner.token_to_id(token) else {
        return Err(anyhow!("token {token} missing from tokenizer vocab"));
    };
    Ok(id)
}

/// Tokenizes `text` and returns the resulting token *strings* (not IDs).
///
/// The second argument to `encode` is `false`, i.e. the tokenizer is asked
/// not to add special tokens around the text.
///
/// # Errors
///
/// Returns an error wrapping the tokenizer's failure message.
pub(crate) fn encode_plain(tokenizer: &hf::Tokenizer, text: &str) -> Result<Vec<String>> {
    // The annotation converts the ct2rs tokenizer reference into the
    // underlying `tokenizers::Tokenizer` so we can call its `encode`.
    let hf_tokenizer: &InnerTokenizer = tokenizer;
    match hf_tokenizer.encode(text, false) {
        Ok(encoding) => Ok(encoding.get_tokens().to_vec()),
        Err(e) => Err(anyhow!("failed to tokenize prompt text: {e}")),
    }
}

/// Inputs for the per-chunk prompt vector that `sys::Whisper::generate`
/// consumes; [`PromptParts::build`] assembles them into token strings.
///
/// Layout:
///
/// ```text
/// [<|startofprev|> <initial_prompt_tokens>]? <|startoftranscript|>
///   [<|lang|> <|transcribe|>]? [<|notimestamps|>]? <prefix_tokens>*
/// ```
///
/// * `<|startofprev|>` and the initial-prompt tokens appear only when
///   `initial_prompt` is non-empty.
/// * `<|lang|> <|transcribe|>` appear only when `multilingual` is `true`.
/// * `<|notimestamps|>` is appended only when `with_timestamps` is `false`.
///   When it is `true` the token is left out, so the model is free to emit
///   the `<|t_..|>` timestamp tokens we parse back out.
pub(crate) struct PromptParts<'a> {
    /// Start-of-transcript token string.
    pub(crate) sot: &'a str,
    /// Token string that introduces the initial-prompt tokens.
    pub(crate) startofprev: &'a str,
    /// Language token string, e.g. `<|de|>`; used only when `multilingual`.
    pub(crate) language_token: &'a str,
    /// Task token string; used only when `multilingual`.
    pub(crate) transcribe: &'a str,
    /// Token string appended when `with_timestamps` is `false`.
    pub(crate) no_timestamps: &'a str,
    /// Already-tokenized initial prompt; may be empty.
    pub(crate) initial_prompt: &'a [String],
    /// Already-tokenized prefix appended at the very end; may be empty.
    pub(crate) prefix: &'a [String],
    /// `true` to let the model emit timestamps (no `<|notimestamps|>`).
    pub(crate) with_timestamps: bool,
    /// `false` for English-only checkpoints (`*.en`): the SOT block is
    /// just `<|startoftranscript|>`. Multilingual checkpoints append the
    /// language and `<|transcribe|>` tokens, matching faster-whisper.
    pub(crate) multilingual: bool,
}

impl PromptParts<'_> {
    /// Assembles the prompt as owned token strings in the layout described
    /// on [`PromptParts`].
    pub(crate) fn build(&self) -> Vec<String> {
        // At most five fixed tokens surround the two variable-length runs:
        // startofprev, sot, language, transcribe, notimestamps.
        let capacity = self.initial_prompt.len() + self.prefix.len() + 5;
        let mut prompt: Vec<String> = Vec::with_capacity(capacity);

        // Previous-context block, only when there is context to carry.
        if !self.initial_prompt.is_empty() {
            prompt.push(String::from(self.startofprev));
            prompt.extend_from_slice(self.initial_prompt);
        }

        // SOT block.
        prompt.push(String::from(self.sot));
        if self.multilingual {
            prompt.extend([self.language_token, self.transcribe].map(str::to_owned));
        }

        // Timestamps are suppressed by adding the token, not by omitting one.
        if !self.with_timestamps {
            prompt.push(String::from(self.no_timestamps));
        }

        prompt.extend_from_slice(self.prefix);
        prompt
    }
}

/// One `<|start_ts|> text... <|end_ts|>` sub-segment carved out of a chunk's
/// generated token IDs. Offsets are relative to the chunk; the caller adds
/// the chunk's start time to produce absolute audio time.
#[derive(Debug)]
pub(crate) struct SubSegment {
    /// Text token IDs between the two timestamps (timestamps excluded).
    pub(crate) text_token_ids: Vec<u32>,
    /// Segment start, in seconds from the beginning of the chunk.
    pub(crate) start_in_chunk: f32,
    /// Segment end, in seconds from the beginning of the chunk.
    pub(crate) end_in_chunk: f32,
}

/// Carves a generated chunk's token IDs into `<|t_start|> text <|t_end|>`
/// sub-segments. Any ID `>= timestamp_begin` counts as a timestamp.
///
/// Text that is not enclosed by an opening timestamp is dropped: this covers
/// the preamble before the first timestamp and any text sitting between a
/// closing timestamp and the next opening one. Pairs with no text between
/// them produce no sub-segment.
///
/// `chunk_duration_s` is the wall-clock length of the Whisper window
/// (30 s for every published checkpoint). It serves as the fallback
/// `end_in_chunk` in two situations:
///
/// 1. **Unclosed pair**: the stream ends as `<|t_start|> text [EOT]` with
///    no closing timestamp. Some fine-tunes (notably notebotIE Swiss-German)
///    only reliably emit the opening timestamp.
/// 2. **No timestamps at all**: the prompt asked for `<|notimestamps|>`,
///    or the fine-tune ignored the timestamp instruction and emitted
///    plain text. The whole token stream becomes one sub-segment
///    spanning `[0, chunk_duration_s)`.
///
/// Faster-whisper handles both cases the same way; dropping the text
/// silently is how multi-second turns turned into empty transcripts.
pub(crate) fn split_sub_segments(
    token_ids: &[u32],
    timestamp_begin: u32,
    chunk_duration_s: f32,
) -> Vec<SubSegment> {
    let is_timestamp = |id: &u32| *id >= timestamp_begin;

    let mut segments = Vec::new();
    // Index of the first token not yet consumed by a completed pair.
    let mut cursor = 0;
    let mut seen_timestamp = false;

    while cursor < token_ids.len() {
        // Locate the opening timestamp, skipping any unenclosed text.
        let Some(skipped) = token_ids[cursor..].iter().position(is_timestamp) else {
            // Nothing but text remains. If no timestamp was ever seen the
            // stream is timestamp-free, so keep all of it as one segment
            // covering the whole window; otherwise the tail is dropped.
            if !seen_timestamp {
                segments.push(SubSegment {
                    text_token_ids: token_ids[cursor..].to_vec(),
                    start_in_chunk: 0.0,
                    end_in_chunk: chunk_duration_s,
                });
            }
            break;
        };
        seen_timestamp = true;

        let open = cursor + skipped;
        let start_in_chunk = timestamp_seconds(token_ids[open], timestamp_begin);
        // Everything after the opening timestamp, up to the end of the chunk.
        let tail = &token_ids[open + 1..];

        // Locate the closing timestamp; its offset in `tail` is the text length.
        let Some(text_len) = tail.iter().position(is_timestamp) else {
            // Unclosed pair: flush the pending text with the chunk window's
            // end as the fallback boundary instead of silently dropping it.
            if !tail.is_empty() {
                segments.push(SubSegment {
                    text_token_ids: tail.to_vec(),
                    start_in_chunk,
                    end_in_chunk: chunk_duration_s,
                });
            }
            break;
        };

        let end_id = tail[text_len];
        // Resume right after the closing timestamp.
        cursor = open + 1 + text_len + 1;

        // Back-to-back timestamps enclose no text; emit nothing for them.
        if text_len == 0 {
            continue;
        }

        segments.push(SubSegment {
            text_token_ids: tail[..text_len].to_vec(),
            start_in_chunk,
            end_in_chunk: timestamp_seconds(end_id, timestamp_begin),
        });
    }

    segments
}

/// Converts a timestamp token ID to seconds from the start of the chunk,
/// at 0.02 s per step above `timestamp_begin`.
///
/// Callers must pass `token_id >= timestamp_begin`; the subtraction is
/// unchecked `u32` arithmetic.
#[inline]
fn timestamp_seconds(token_id: u32, timestamp_begin: u32) -> f32 {
    const SECONDS_PER_STEP: f32 = 0.02;
    let steps = token_id - timestamp_begin;
    steps as f32 * SECONDS_PER_STEP
}

/// Decodes a flat list of text-only token IDs to a single string.
///
/// The second argument to `decode` is `true`, i.e. the tokenizer is asked
/// to skip special tokens in the output.
///
/// # Errors
///
/// Returns an error wrapping the tokenizer's failure message.
pub(crate) fn decode_ids(tokenizer: &hf::Tokenizer, ids: &[u32]) -> Result<String> {
    // The annotation converts the ct2rs tokenizer reference into the
    // underlying `tokenizers::Tokenizer` so we can call its `decode`.
    let hf_tokenizer: &InnerTokenizer = tokenizer;
    let decoded = hf_tokenizer.decode(ids, true);
    decoded.map_err(|err| anyhow!("failed to decode tokens: {err}"))
}

// Unit tests for the tokenizer-independent parts of this module:
// `split_sub_segments`, `timestamp_seconds` and `PromptParts::build`.
#[cfg(test)]
mod tests {
    use super::*;

    // Stand-in for `timestamp_begin`: every ID below it is treated as text,
    // every ID at or above it as a timestamp.
    const BEGIN: u32 = 50_000;
    // Value passed as `chunk_duration_s`, in seconds.
    const CHUNK_S: f32 = 30.0;

    // Timestamp token ID sitting `offset` steps (0.02 s each) above `BEGIN`.
    fn ts(offset: u32) -> u32 {
        BEGIN + offset
    }

    #[test]
    fn split_sub_segments_returns_empty_for_no_tokens() {
        assert!(split_sub_segments(&[], BEGIN, CHUNK_S).is_empty());
    }

    #[test]
    fn split_sub_segments_discards_preamble_before_first_timestamp() {
        // Tokens 10 and 20 precede the first timestamp and must not appear
        // in the output; ts(100) is 100 * 0.02 s = 2.0 s.
        let out = split_sub_segments(&[10, 20, ts(0), 100, 101, ts(100)], BEGIN, CHUNK_S);
        assert_eq!(out.len(), 1);
        assert_eq!(out[0].text_token_ids, vec![100, 101]);
        assert!((out[0].start_in_chunk - 0.0).abs() < 1e-6);
        assert!((out[0].end_in_chunk - 2.0).abs() < 1e-6);
    }

    #[test]
    fn split_sub_segments_handles_back_to_back_pairs() {
        // The closing ts(50) of the first pair is immediately followed by
        // the opening ts(50) of the second pair.
        let out = split_sub_segments(&[ts(0), 100, ts(50), ts(50), 200, ts(150)], BEGIN, CHUNK_S);
        assert_eq!(out.len(), 2);
        assert_eq!(out[0].text_token_ids, vec![100]);
        assert!((out[0].end_in_chunk - 1.0).abs() < 1e-6);
        assert_eq!(out[1].text_token_ids, vec![200]);
        assert!((out[1].start_in_chunk - 1.0).abs() < 1e-6);
        assert!((out[1].end_in_chunk - 3.0).abs() < 1e-6);
    }

    #[test]
    fn split_sub_segments_skips_pairs_with_empty_text() {
        // A start/end pair with nothing in between yields no sub-segment.
        let out = split_sub_segments(&[ts(0), ts(50)], BEGIN, CHUNK_S);
        assert!(out.is_empty());
    }

    #[test]
    fn split_sub_segments_flushes_all_text_when_no_timestamps_emitted() {
        // `<|notimestamps|>` mode or fine-tunes that just refuse to emit
        // timestamps: the whole token stream becomes one segment spanning
        // [0, chunk_duration_s).
        let out = split_sub_segments(&[100, 101, 102, 103], BEGIN, CHUNK_S);
        assert_eq!(out.len(), 1);
        assert_eq!(out[0].text_token_ids, vec![100, 101, 102, 103]);
        assert!((out[0].start_in_chunk - 0.0).abs() < 1e-6);
        assert!((out[0].end_in_chunk - CHUNK_S).abs() < 1e-6);
    }

    #[test]
    fn split_sub_segments_flushes_text_after_unclosed_start_timestamp() {
        // Some fine-tunes emit `<|t_start|> text [EOT]` without a closing
        // timestamp. The text must be flushed with `chunk_duration_s` as
        // the fallback end, not dropped.
        let out = split_sub_segments(&[ts(0), 100, 101, 102], BEGIN, CHUNK_S);
        assert_eq!(out.len(), 1);
        assert_eq!(out[0].text_token_ids, vec![100, 101, 102]);
        assert!((out[0].start_in_chunk - 0.0).abs() < 1e-6);
        assert!((out[0].end_in_chunk - CHUNK_S).abs() < 1e-6);
    }

    #[test]
    fn split_sub_segments_flushes_trailing_text_after_closed_pair() {
        // Mixed case: one balanced pair followed by an unclosed
        // `<|t_start|> text` tail. Both must appear in the output.
        let out = split_sub_segments(&[ts(0), 100, ts(50), ts(60), 200, 201], BEGIN, CHUNK_S);
        assert_eq!(out.len(), 2);
        assert_eq!(out[0].text_token_ids, vec![100]);
        assert_eq!(out[1].text_token_ids, vec![200, 201]);
        assert!((out[1].start_in_chunk - 1.2).abs() < 1e-6);
        assert!((out[1].end_in_chunk - CHUNK_S).abs() < 1e-6);
    }

    #[test]
    fn split_sub_segments_drops_lone_dangling_start_timestamp() {
        // `<|t_start|>` immediately followed by EOT (no text) still
        // produces nothing — there is nothing to flush.
        let out = split_sub_segments(&[ts(0)], BEGIN, CHUNK_S);
        assert!(out.is_empty());
    }

    #[test]
    fn timestamp_seconds_uses_two_centisecond_step() {
        // The last case uses a looser tolerance than the first two.
        assert!((timestamp_seconds(BEGIN, BEGIN) - 0.0).abs() < 1e-6);
        assert!((timestamp_seconds(BEGIN + 1, BEGIN) - 0.02).abs() < 1e-6);
        assert!((timestamp_seconds(BEGIN + 1500, BEGIN) - 30.0).abs() < 1e-4);
    }

    // PromptParts uses the literal token strings from this module, not
    // numeric ids, so we can build prompts without a loaded tokenizer.
    fn parts<'a>(
        initial_prompt: &'a [String],
        prefix: &'a [String],
        with_timestamps: bool,
        multilingual: bool,
        lang: &'a str,
    ) -> PromptParts<'a> {
        PromptParts {
            sot: SOT,
            startofprev: STARTOFPREV,
            language_token: lang,
            transcribe: TRANSCRIBE,
            no_timestamps: NO_TIMESTAMPS,
            initial_prompt,
            prefix,
            with_timestamps,
            multilingual,
        }
    }

    #[test]
    fn prompt_english_only_no_timestamps() {
        // .en checkpoints: SOT block is just `<|startoftranscript|>`,
        // then `<|notimestamps|>`. No lang or `<|transcribe|>`.
        let p = parts(&[], &[], false, false, "<|en|>");
        assert_eq!(p.build(), vec![SOT.to_owned(), NO_TIMESTAMPS.to_owned()]);
    }

    #[test]
    fn prompt_multilingual_with_timestamps() {
        // Multilingual + with_timestamps: SOT, lang, transcribe, no
        // `<|notimestamps|>` because the model must emit timestamps.
        let p = parts(&[], &[], true, true, "<|de|>");
        assert_eq!(
            p.build(),
            vec![SOT.to_owned(), "<|de|>".to_owned(), TRANSCRIBE.to_owned()]
        );
    }

    #[test]
    fn prompt_with_initial_prompt_prepends_startofprev() {
        // A non-empty initial prompt is introduced by `<|startofprev|>` and
        // placed ahead of the SOT block.
        let initial = vec!["hello".to_owned(), "world".to_owned()];
        let p = parts(&initial, &[], false, true, "<|en|>");
        assert_eq!(
            p.build(),
            vec![
                STARTOFPREV.to_owned(),
                "hello".to_owned(),
                "world".to_owned(),
                SOT.to_owned(),
                "<|en|>".to_owned(),
                TRANSCRIBE.to_owned(),
                NO_TIMESTAMPS.to_owned(),
            ]
        );
    }

    #[test]
    fn prompt_with_prefix_appended_after_sot_block() {
        // Prefix tokens come last, after `<|notimestamps|>`.
        let prefix = vec!["The".to_owned(), "topic".to_owned()];
        let p = parts(&[], &prefix, false, true, "<|en|>");
        assert_eq!(
            p.build(),
            vec![
                SOT.to_owned(),
                "<|en|>".to_owned(),
                TRANSCRIBE.to_owned(),
                NO_TIMESTAMPS.to_owned(),
                "The".to_owned(),
                "topic".to_owned(),
            ]
        );
    }
}