// priv/native/ios/mob_screencast_nif.m

/* mob_screencast_nif — iOS screen-capture tier-1 plugin NIF (Objective-C).
 *
 * The iOS counterpart to priv/native/jni/mob_screencast_nif.zig: captures the
 * device's own screen with ReplayKit's in-app RPScreenRecorder (per-session user
 * consent, no broadcast extension), hardware-encodes it to H264 with a
 * VideoToolbox VTCompressionSession, converts the encoder's AVCC output to
 * Annex-B (start-code) NAL units (SPS/PPS prepended to keyframes), and pushes each
 * access unit to the BEAM as:
 *
 *   {:screencast, :frame, #{bytes, width, height, format: :h264, timestamp_ms, keyframe}}
 *
 * exactly like the Android bridge's nativeDeliverScreencastFrame. Consent outcome
 * is reported as {:screencast, :permission, :granted | :denied}.
 *
 * Compiled as ObjC (-fobjc-arc) via the plugin objc-NIF path (manifest lang: :objc).
 * Under ARC, CoreFoundation / CoreMedia / VideoToolbox objects are NOT managed — they
 * are released manually with CFRelease.
 */
#import <CoreMedia/CoreMedia.h>
#import <CoreVideo/CoreVideo.h>
#import <Foundation/Foundation.h>
#import <ReplayKit/ReplayKit.h>
#import <VideoToolbox/VideoToolbox.h>
#include <erl_nif.h>

// ── Capture session state ───────────────────────────────────────────────────
// All VTCompressionSession lifecycle (create / encode / teardown) is serialized
// onto g_sc_queue so start_stream, the ReplayKit sample handler, and stop_stream
// never race on g_vt_session. The VT output callback runs on a VideoToolbox
// thread and only calls enif_send (documented thread-safe).

// Process that called screencast_start_stream (captured with enif_self); it is the
// destination of every {:screencast, ...} message this file sends.
static ErlNifPid g_sc_pid;
// YES once g_sc_pid has been filled in; sc_send_permission sends nothing while NO.
static BOOL g_sc_have_pid = NO;
// The H264 encoder. Created lazily by sc_ensure_session, released by
// sc_teardown_session; NULL whenever no session exists.
static VTCompressionSessionRef g_vt_session = NULL;
// Serial queue returned by sc_queue(); NULL until sc_queue() is first called.
static dispatch_queue_t g_sc_queue = NULL;
// Encoder settings. The values below are the defaults; screencast_start_stream
// overrides each one that is present in its JSON options ("bitrate", "fps",
// "keyframe_interval_ms", "max_size"). They are applied when the session is created.
static int g_sc_bitrate = 2000000;           // bits per second (AverageBitRate)
static int g_sc_fps = 30;                    // ExpectedFrameRate
static int g_sc_keyframe_interval_ms = 2000; // converted to seconds for the encoder
static int g_sc_max_size = 0; // honored on Android; iOS encodes at native res (TODO)
// Dimensions of the frame the session was created from; reported as width / height
// in every frame message.
static int g_enc_w = 0;
static int g_enc_h = 0;
// Set by screencast_request_keyframe; the sample handler forces a keyframe on the
// next encoded frame and clears the flag.
static BOOL g_sc_force_keyframe = NO;

// 4-byte Annex-B start code written before every parameter set and NAL unit.
static const uint8_t kAnnexBStartCode[4] = {0x00, 0x00, 0x00, 0x01};

// Returns the serial queue that owns the compression session, creating it exactly
// once on first use (dispatch_once makes concurrent first calls safe).
static dispatch_queue_t sc_queue(void) {
    static dispatch_once_t create_token;
    dispatch_once(&create_token, ^{
      dispatch_queue_t serial =
          dispatch_queue_create("io.mob.screencast.session", DISPATCH_QUEUE_SERIAL);
      g_sc_queue = serial;
    });
    return g_sc_queue;
}

// Sends {:screencast, :permission, <status>} to the process that started the stream.
// `status` is used verbatim as the atom name ("granted" / "denied" at the call site).
// Does nothing when no start_stream call has recorded a destination pid yet.
static void sc_send_permission(const char *status) {
    if (g_sc_have_pid) {
        // A process-independent env is required because this runs outside a NIF call.
        ErlNifEnv *msg_env = enif_alloc_env();
        ERL_NIF_TERM tag = enif_make_atom(msg_env, "screencast");
        ERL_NIF_TERM kind = enif_make_atom(msg_env, "permission");
        ERL_NIF_TERM outcome = enif_make_atom(msg_env, status);
        ERL_NIF_TERM tuple = enif_make_tuple3(msg_env, tag, kind, outcome);
        enif_send(NULL, &g_sc_pid, msg_env, tuple);
        enif_free_env(msg_env);
    }
}

// ── VideoToolbox output callback: AVCC -> Annex-B, deliver per access unit ───

// Returns YES when the encoded sample is a sync sample (IDR). A sample counts as a
// keyframe unless its first attachment dictionary carries NotSync == true; a missing
// attachments array or a missing key both mean "sync".
static BOOL sc_sample_is_keyframe(CMSampleBufferRef sampleBuffer) {
    CFArrayRef attachments = CMSampleBufferGetSampleAttachmentsArray(sampleBuffer, false);
    if (attachments == NULL || CFArrayGetCount(attachments) == 0)
        return YES;

    CFDictionaryRef dict = (CFDictionaryRef)CFArrayGetValueAtIndex(attachments, 0);
    CFBooleanRef notSync = NULL;
    if (CFDictionaryGetValueIfPresent(dict, kCMSampleAttachmentKey_NotSync,
                                      (const void **)&notSync) &&
        notSync != NULL) {
        return !CFBooleanGetValue(notSync);
    }
    return YES;
}

// Appends every H264 parameter set (SPS/PPS) stored in `fmt` to `annexb`, each one
// preceded by an Annex-B start code. Parameter sets that cannot be read are skipped.
static void sc_append_parameter_sets(CMFormatDescriptionRef fmt, NSMutableData *annexb) {
    size_t set_count = 0;
    CMVideoFormatDescriptionGetH264ParameterSetAtIndex(fmt, 0, NULL, NULL, &set_count, NULL);
    for (size_t i = 0; i < set_count; i++) {
        const uint8_t *pset = NULL;
        size_t plen = 0;
        OSStatus s =
            CMVideoFormatDescriptionGetH264ParameterSetAtIndex(fmt, i, &pset, &plen, NULL, NULL);
        if (s != noErr || pset == NULL || plen == 0)
            continue;
        [annexb appendBytes:kAnnexBStartCode length:sizeof(kAnnexBStartCode)];
        [annexb appendBytes:pset length:plen];
    }
}

// Walks the AVCC payload in `block` and appends each NAL unit to `annexb` with its
// big-endian length prefix (`nal_header_len` bytes) replaced by a start code.
// Returns YES when at least one NAL unit was appended. Parsing stops at the first
// zero-length or truncated NAL unit.
static BOOL sc_append_nal_units(CMBlockBufferRef block, int nal_header_len,
                                NSMutableData *annexb) {
    // A prefix wider than 4 bytes would overflow the 32-bit length; 0 cannot advance.
    if (block == NULL || nal_header_len < 1 || nal_header_len > 4)
        return NO;

    size_t contiguous = 0;
    size_t total = 0;
    char *data = NULL;
    if (CMBlockBufferGetDataPointer(block, 0, &contiguous, &total, &data) != noErr ||
        data == NULL) {
        return NO;
    }

    // The returned pointer is only valid for `contiguous` bytes. When the block buffer
    // is fragmented, flatten it first so the walk below never reads past a fragment.
    NS_VALID_UNTIL_END_OF_SCOPE NSMutableData *flat = nil;
    if (contiguous < total) {
        flat = [NSMutableData dataWithLength:total];
        if (flat == nil || CMBlockBufferCopyDataBytes(block, 0, total, flat.mutableBytes) != noErr)
            return NO;
        data = (char *)flat.mutableBytes;
    }

    const size_t header = (size_t)nal_header_len;
    BOOL appended = NO;
    size_t offset = 0;
    while (offset + header <= total) {
        uint32_t nal_len = 0;
        for (size_t b = 0; b < header; b++) {
            nal_len = (nal_len << 8) | (uint8_t)data[offset + b];
        }
        offset += header;
        if (nal_len == 0 || nal_len > total - offset)
            break;
        [annexb appendBytes:kAnnexBStartCode length:sizeof(kAnnexBStartCode)];
        [annexb appendBytes:(data + offset) length:nal_len];
        offset += nal_len;
        appended = YES;
    }
    return appended;
}

// VTCompressionSession output callback. Converts one encoded sample from AVCC to
// Annex-B (SPS/PPS prepended on keyframes) and sends it to the owning process as
//   {:screencast, :frame, #{bytes, width, height, format, timestamp_ms, keyframe}}
//
// Failed / not-ready samples, and samples from which no NAL unit could be extracted,
// are dropped silently. `timestamp_ms` is the wall-clock time at delivery, not the
// capture presentation timestamp. The three leading parameters are unused.
static void sc_vt_output(void *outputCallbackRefCon, void *sourceFrameRefCon, OSStatus status,
                         VTEncodeInfoFlags infoFlags, CMSampleBufferRef sampleBuffer) {
    (void)outputCallbackRefCon;
    (void)sourceFrameRefCon;
    (void)infoFlags;
    if (status != noErr || sampleBuffer == NULL || !CMSampleBufferDataIsReady(sampleBuffer))
        return;
    if (!g_sc_have_pid)
        return;

    // This runs on a thread we do not own; drain the temporaries created below
    // (NSMutableData, NSDate) per frame instead of relying on an outer pool.
    @autoreleasepool {
        BOOL keyframe = sc_sample_is_keyframe(sampleBuffer);
        CMFormatDescriptionRef fmt = CMSampleBufferGetFormatDescription(sampleBuffer);

        // AVCC NAL length-prefix size (almost always 4); read it from the format desc.
        int nal_header_len = 4;
        if (fmt) {
            size_t ignored_count = 0;
            CMVideoFormatDescriptionGetH264ParameterSetAtIndex(fmt, 0, NULL, NULL,
                                                               &ignored_count, &nal_header_len);
        }

        NSMutableData *annexb = [NSMutableData data];

        // Prepend SPS/PPS (Annex-B) to keyframes so a freshly-joined decoder can start.
        if (keyframe && fmt)
            sc_append_parameter_sets(fmt, annexb);

        // Without at least one NAL unit there is no picture data; do not deliver a
        // parameter-set-only message labelled as a frame.
        if (!sc_append_nal_units(CMSampleBufferGetDataBuffer(sampleBuffer), nal_header_len,
                                 annexb)) {
            return;
        }

        uint64_t now_ms = (uint64_t)([[NSDate date] timeIntervalSince1970] * 1000.0);

        ErlNifBinary out;
        if (!enif_alloc_binary(annexb.length, &out))
            return;
        memcpy(out.data, annexb.bytes, annexb.length);

        ErlNifEnv *e = enif_alloc_env();
        if (e == NULL) {
            enif_release_binary(&out);
            return;
        }

        ERL_NIF_TERM map = enif_make_new_map(e);
        // enif_make_binary takes ownership of `out`; it is freed with the env.
        enif_make_map_put(e, map, enif_make_atom(e, "bytes"), enif_make_binary(e, &out), &map);
        enif_make_map_put(e, map, enif_make_atom(e, "width"), enif_make_int(e, g_enc_w), &map);
        enif_make_map_put(e, map, enif_make_atom(e, "height"), enif_make_int(e, g_enc_h), &map);
        enif_make_map_put(e, map, enif_make_atom(e, "format"), enif_make_atom(e, "h264"), &map);
        enif_make_map_put(e, map, enif_make_atom(e, "timestamp_ms"),
                          enif_make_uint64(e, now_ms), &map);
        enif_make_map_put(e, map, enif_make_atom(e, "keyframe"),
                          enif_make_atom(e, keyframe ? "true" : "false"), &map);

        ERL_NIF_TERM msg = enif_make_tuple3(e, enif_make_atom(e, "screencast"),
                                            enif_make_atom(e, "frame"), map);
        enif_send(NULL, &g_sc_pid, e, msg);
        enif_free_env(e);
    }
}

// Sets one property on the compression session and logs (rather than silently
// dropping) a failure, so a rejected setting is visible in the device log.
static void sc_set_session_property(VTCompressionSessionRef session, CFStringRef key,
                                    CFTypeRef value, const char *label) {
    OSStatus s = VTSessionSetProperty(session, key, value);
    if (s != noErr)
        NSLog(@"[mob/screencast] VTSessionSetProperty(%s) failed: %d", label, (int)s);
}

// Same as sc_set_session_property for a numeric value; creates and releases the
// temporary CFNumber and tolerates CFNumberCreate returning NULL.
static void sc_set_session_number(VTCompressionSessionRef session, CFStringRef key,
                                  CFNumberType type, const void *value, const char *label) {
    CFNumberRef number = CFNumberCreate(kCFAllocatorDefault, type, value);
    if (number == NULL) {
        NSLog(@"[mob/screencast] CFNumberCreate(%s) failed", label);
        return;
    }
    sc_set_session_property(session, key, number, label);
    CFRelease(number);
}

// Lazily create the compression session from the first frame's dimensions. Must run
// on sc_queue.
//
// No-op when a session already exists or the image reports a zero dimension. On
// creation failure g_vt_session stays NULL, so the next frame retries. The session is
// configured for real-time, baseline-profile H264 without frame reordering, using the
// current g_sc_bitrate / g_sc_fps / g_sc_keyframe_interval_ms values.
static void sc_ensure_session(CVImageBufferRef image) {
    if (g_vt_session)
        return;
    size_t w = CVPixelBufferGetWidth(image);
    size_t h = CVPixelBufferGetHeight(image);
    if (w == 0 || h == 0)
        return;

    OSStatus s = VTCompressionSessionCreate(kCFAllocatorDefault, (int32_t)w, (int32_t)h,
                                            kCMVideoCodecType_H264, NULL, NULL, NULL, sc_vt_output,
                                            NULL, &g_vt_session);
    if (s != noErr || !g_vt_session) {
        NSLog(@"[mob/screencast] VTCompressionSessionCreate failed: %d", (int)s);
        g_vt_session = NULL;
        return;
    }

    // Publish the encoded dimensions only once a session really exists; the output
    // callback reports them with every frame.
    g_enc_w = (int)w;
    g_enc_h = (int)h;

    sc_set_session_property(g_vt_session, kVTCompressionPropertyKey_RealTime, kCFBooleanTrue,
                            "RealTime");
    // Constrained baseline, no B-frames: low latency + broad decoder compat (matches the
    // host Publisher's H264 mode-1 default + the FU-A payloader).
    sc_set_session_property(g_vt_session, kVTCompressionPropertyKey_ProfileLevel,
                            kVTProfileLevel_H264_Baseline_AutoLevel, "ProfileLevel");
    sc_set_session_property(g_vt_session, kVTCompressionPropertyKey_AllowFrameReordering,
                            kCFBooleanFalse, "AllowFrameReordering");

    sc_set_session_number(g_vt_session, kVTCompressionPropertyKey_AverageBitRate,
                          kCFNumberIntType, &g_sc_bitrate, "AverageBitRate");
    sc_set_session_number(g_vt_session, kVTCompressionPropertyKey_ExpectedFrameRate,
                          kCFNumberIntType, &g_sc_fps, "ExpectedFrameRate");

    // The encoder takes the keyframe interval as a duration in seconds.
    double kf_sec = g_sc_keyframe_interval_ms / 1000.0;
    sc_set_session_number(g_vt_session, kVTCompressionPropertyKey_MaxKeyFrameIntervalDuration,
                          kCFNumberDoubleType, &kf_sec, "MaxKeyFrameIntervalDuration");

    OSStatus prep = VTCompressionSessionPrepareToEncodeFrames(g_vt_session);
    if (prep != noErr)
        NSLog(@"[mob/screencast] VTCompressionSessionPrepareToEncodeFrames failed: %d", (int)prep);

    NSLog(@"[mob/screencast] capturing %zux%zu @ %dbps", w, h, g_sc_bitrate);
}

// Flushes, invalidates and releases the compression session, then clears
// g_vt_session. Safe to call when no session exists (does nothing).
static void sc_teardown_session(void) {
    VTCompressionSessionRef session = g_vt_session;
    if (session == NULL)
        return;

    // kCMTimeInvalid asks the encoder to finish every pending frame before teardown.
    VTCompressionSessionCompleteFrames(session, kCMTimeInvalid);
    VTCompressionSessionInvalidate(session);
    // Not ARC-managed: the create call handed us a +1 reference.
    CFRelease(session);
    g_vt_session = NULL;
}

// ── NIFs ─────────────────────────────────────────────────────────────────────

// Overwrites *target with opts[key] when that value can be read as an integer that
// is >= min_value; otherwise leaves *target untouched. Values that do not respond to
// -intValue (JSON null, arrays, objects) are ignored instead of raising.
static void sc_apply_int_option(NSDictionary *opts, NSString *key, int min_value, int *target) {
    id value = opts[key];
    if (value == nil || ![value respondsToSelector:@selector(intValue)])
        return;
    int parsed = [(NSNumber *)value intValue];
    if (parsed >= min_value)
        *target = parsed;
}

// screencast_start_stream(OptsJson) -> ok | badarg
//
// argv[0] is a binary or iolist holding a JSON object. Recognised keys: "bitrate",
// "fps", "keyframe_interval_ms", "max_size"; each present, usable value replaces the
// current setting, anything else (including unparsable JSON) leaves the current
// settings in place. The calling process becomes the recipient of all
// {:screencast, ...} messages.
//
// Starts ReplayKit in-app capture. Video sample buffers are hopped onto sc_queue,
// where the compression session is created on demand and the frame is encoded. The
// consent outcome arrives asynchronously as {:screencast, :permission, ...}; the
// return value only means the request was issued.
static ERL_NIF_TERM nif_screencast_start_stream(ErlNifEnv *env, int argc,
                                                const ERL_NIF_TERM argv[]) {
    (void)argc;
    ErlNifBinary bin;
    if (!enif_inspect_binary(env, argv[0], &bin) &&
        !enif_inspect_iolist_as_binary(env, argv[0], &bin)) {
        return enif_make_badarg(env);
    }

    // NIFs run on a BEAM scheduler thread, which has no enclosing autorelease pool.
    @autoreleasepool {
        // Parse the bytes directly. Going through NSString yields nil for invalid
        // UTF-8, and NSJSONSerialization raises on nil data, which would take down
        // the whole VM.
        NSData *payload = [NSData dataWithBytes:bin.data length:bin.size];
        id parsed = nil;
        if (payload.length > 0) {
            NSError *json_error = nil;
            parsed = [NSJSONSerialization JSONObjectWithData:payload options:0 error:&json_error];
            if (parsed == nil)
                NSLog(@"[mob/screencast] ignoring unparsable options: %@", json_error);
        }
        if ([parsed isKindOfClass:[NSDictionary class]]) {
            NSDictionary *opts = (NSDictionary *)parsed;
            sc_apply_int_option(opts, @"bitrate", 1, &g_sc_bitrate);
            sc_apply_int_option(opts, @"fps", 1, &g_sc_fps);
            sc_apply_int_option(opts, @"keyframe_interval_ms", 0, &g_sc_keyframe_interval_ms);
            sc_apply_int_option(opts, @"max_size", 0, &g_sc_max_size);
        }

        enif_self(env, &g_sc_pid);
        g_sc_have_pid = YES;
        g_sc_force_keyframe = NO;

        RPScreenRecorder *rec = [RPScreenRecorder sharedRecorder];
        rec.microphoneEnabled = NO;

        [rec startCaptureWithHandler:^(CMSampleBufferRef sampleBuffer,
                                       RPSampleBufferType bufferType, NSError *error) {
          if (error || bufferType != RPSampleBufferTypeVideo || sampleBuffer == NULL)
              return;
          if (!CMSampleBufferDataIsReady(sampleBuffer))
              return;
          // Retain across the async hop to sc_queue (the sample buffer owns the image buffer).
          CFRetain(sampleBuffer);
          dispatch_async(sc_queue(), ^{
            CVImageBufferRef image = CMSampleBufferGetImageBuffer(sampleBuffer);
            if (image) {
                sc_ensure_session(image);
                if (g_vt_session) {
                    CMTime pts = CMSampleBufferGetPresentationTimeStamp(sampleBuffer);
                    CFDictionaryRef frame_props = NULL;
                    if (g_sc_force_keyframe) {
                        const void *k = kVTEncodeFrameOptionKey_ForceKeyFrame;
                        const void *v = kCFBooleanTrue;
                        frame_props = CFDictionaryCreate(NULL, &k, &v, 1,
                                                         &kCFTypeDictionaryKeyCallBacks,
                                                         &kCFTypeDictionaryValueCallBacks);
                        g_sc_force_keyframe = NO;
                    }
                    VTEncodeInfoFlags flags = 0;
                    OSStatus enc =
                        VTCompressionSessionEncodeFrame(g_vt_session, image, pts, kCMTimeInvalid,
                                                        frame_props, NULL, &flags);
                    if (frame_props)
                        CFRelease(frame_props);
                    if (enc == kVTInvalidSessionErr) {
                        // The session is unusable; drop it so the next frame builds a
                        // fresh one instead of failing on every frame from here on.
                        NSLog(@"[mob/screencast] compression session invalid; recreating");
                        sc_teardown_session();
                    }
                }
            }
            CFRelease(sampleBuffer);
          });
        }
            completionHandler:^(NSError *error) {
              // Called once when capture starts (error == nil) or the user declines / it fails.
              sc_send_permission(error == nil ? "granted" : "denied");
              if (error)
                  NSLog(@"[mob/screencast] startCapture failed: %@", error);
            }];
    }

    return enif_make_atom(env, "ok");
}

// screencast_stop_stream() -> ok
//
// Asks ReplayKit to stop capturing and releases the compression session. Both steps
// are asynchronous; `ok` only means they were requested.
//
// The session is torn down twice on purpose. The first teardown stops encoding
// promptly. But stopCapture completes later, and a sample buffer delivered in between
// would make the frame path lazily create a *new* session that nothing would ever
// release. The second teardown, queued once ReplayKit reports the stop, runs on the
// same serial queue behind any such late frame and cleans that session up.
static ERL_NIF_TERM nif_screencast_stop_stream(ErlNifEnv *env, int argc,
                                               const ERL_NIF_TERM argv[]) {
    (void)argc;
    (void)argv;
    // NIFs run on a BEAM scheduler thread, which has no enclosing autorelease pool.
    @autoreleasepool {
        [[RPScreenRecorder sharedRecorder] stopCaptureWithHandler:^(NSError *error) {
          if (error)
              NSLog(@"[mob/screencast] stopCapture failed: %@", error);
          dispatch_async(sc_queue(), ^{
            sc_teardown_session();
          });
        }];
        dispatch_async(sc_queue(), ^{
          sc_teardown_session();
        });
    }
    return enif_make_atom(env, "ok");
}

// screencast_request_keyframe() -> ok
//
// Requests that the next encoded frame be a keyframe. The flag is read and cleared by
// the frame path on sc_queue, so it is set there as well rather than written
// concurrently from the calling scheduler thread. The request therefore takes effect
// for the first frame enqueued after this call.
static ERL_NIF_TERM nif_screencast_request_keyframe(ErlNifEnv *env, int argc,
                                                    const ERL_NIF_TERM argv[]) {
    (void)argc;
    (void)argv;
    dispatch_async(sc_queue(), ^{
      g_sc_force_keyframe = YES;
    });
    return enif_make_atom(env, "ok");
}

// ── Registration ──────────────────────────────────────────────────────────────
// NIF table: each entry is {exported name, arity, C implementation, flags}.
// start_stream takes the JSON options argument; stop_stream and request_keyframe
// take none. NOTE(review): flags == 0 is presumably "regular (non-dirty) NIF" —
// confirm against erl_nif.
static ErlNifFunc nif_funcs[] = {
    {"screencast_start_stream", 1, nif_screencast_start_stream, 0},
    {"screencast_stop_stream", 0, nif_screencast_stop_stream, 0},
    {"screencast_request_keyframe", 0, nif_screencast_request_keyframe, 0},
};

// Registers the table under the name mob_screencast_nif; the four trailing NULLs
// mean no lifecycle callbacks are supplied.
ERL_NIF_INIT(mob_screencast_nif, nif_funcs, NULL, NULL, NULL, NULL)