diff --git a/README.md b/README.md index 8b1a7942..759befc3 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ view inside the editor. ## Features -- Supports streaming both iOS simulators and Android emulators +- Supports streaming iOS simulators and Android emulators, including WebRTC audio - Full simulator control & inspection using private iOS accessibility APIs and Android UIAutomator - available using `simdeck` CLI - Real-time screen `describe` command using accessibility view tree - available in token-efficient format for agents - Profiling built-in: CPU, memory, disk writes, network throughput, hang signals, and stack sampling diff --git a/docs/api/rest.md b/docs/api/rest.md index 5e5828fa..14469902 100644 --- a/docs/api/rest.md +++ b/docs/api/rest.md @@ -173,7 +173,7 @@ Performance query parameters: | `GET` | `/api/simulators/{udid}/control` | Alias for input control WebSocket | | `POST` | `/api/simulators/{udid}/refresh` | Request a fresh frame or keyframe | -For normal clients, copy the browser behavior instead of hand-coding a raw decoder. The UI supports WebRTC first and H.264 WebSocket fallback. +For normal clients, copy the browser behavior instead of hand-coding a raw decoder. The UI supports WebRTC first and H.264 WebSocket fallback. WebRTC carries H.264 video and, when the offer includes an audio receiver, an Opus simulator-audio track sourced from the selected simulator or emulator process tree. The H.264 WebSocket fallback is video-only. Minimal WebRTC request: @@ -194,7 +194,16 @@ Response: ```json { "type": "answer", - "sdp": "v=0..." + "sdp": "v=0...", + "audio": { + "codec": "opus", + "sampleRate": 48000, + "channels": 2 + }, + "video": { + "width": 1179, + "height": 2556 + } } ``` diff --git a/docs/guide/video.md b/docs/guide/video.md index 372ca1b4..b1a4ee4c 100644 --- a/docs/guide/video.md +++ b/docs/guide/video.md @@ -4,6 +4,10 @@ SimDeck streams live device video to the browser. Local sessions default to high iOS simulator H.264 uses VideoToolbox for hardware encoding and x264 for software encoding. +WebRTC streams also include simulator audio. The browser menu exposes a Sound +toggle so viewers can keep playback muted until they want to hear the device. +H.264 WebSocket fallback remains video-only. + ## When encoding runs SimDeck starts encoding when a browser stream needs H.264 frames. The server @@ -73,6 +77,17 @@ simdeck service restart --video-codec software --low-latency The browser tries WebRTC first. If WebRTC cannot render a frame, the UI can fall back to H.264 over WebSocket when the browser supports WebCodecs. +Audio is carried on the WebRTC path using a browser-native Opus track. On +macOS 14.2 and newer, SimDeck uses Core Audio process taps over the selected +simulator or emulator process tree, then routes that tap through a private +aggregate device into the WebRTC audio track. If macOS has not granted system +audio recording access, video still streams and the server logs the +audio-capture failure. While the tap is being read, Core Audio mutes the tapped +simulator process at the hardware output; browser playback is controlled by the +Sound toggle. Android emulators launched by SimDeck are started with host audio +enabled, so restart older no-audio emulator processes before testing Android +sound. + Force a mode while debugging: ```text diff --git a/packages/client/index.html b/packages/client/index.html index be7538db..72c8f4b7 100644 --- a/packages/client/index.html +++ b/packages/client/index.html @@ -24,5 +24,29 @@
+ diff --git a/packages/client/src/app/AppShell.tsx b/packages/client/src/app/AppShell.tsx index a670cea6..a685842d 100644 --- a/packages/client/src/app/AppShell.tsx +++ b/packages/client/src/app/AppShell.tsx @@ -54,7 +54,10 @@ import { simulatorUsesInsetChromeButtons, } from "../features/simulators/simulatorDisplay"; import { useSimulatorList } from "../features/simulators/useSimulatorList"; -import { sendWebRtcControlMessage } from "../features/stream/streamWorkerClient"; +import { + sendWebRtcControlMessage, + setActiveStreamAudioMuted, +} from "../features/stream/streamWorkerClient"; import type { StreamConfig, StreamEncoder, @@ -560,6 +563,8 @@ export function AppShell({ const [streamTransport, setStreamTransport] = useState( initialStreamTransportRef.current, ); + const [streamAudioMuted, setStreamAudioMuted] = useState(true); + const streamAudioMutedRef = useRef(streamAudioMuted); const [streamConfigApplyKey, setStreamConfigApplyKey] = useState(0); const [streamConfigReady, setStreamConfigReady] = useState(false); const [touchIndicators, setTouchIndicators] = useState([]); @@ -812,6 +817,7 @@ export function AppShell({ streamBackend, streamCanvasKey, } = useLiveStream({ + audioMuted: streamAudioMuted, canvasElement: streamCanvasElement, paused: !streamConfigReady, remote: remoteStream, @@ -877,6 +883,17 @@ export function AppShell({ [remoteStream], ); + const toggleStreamAudioMuted = useCallback(() => { + const next = !streamAudioMutedRef.current; + streamAudioMutedRef.current = next; + setActiveStreamAudioMuted(next); + setStreamAudioMuted(next); + }, []); + + useEffect(() => { + streamAudioMutedRef.current = streamAudioMuted; + }, [streamAudioMuted]); + useEffect(() => { if ( !selectedSimulator || @@ -2931,6 +2948,7 @@ export function AppShell({ onStreamFpsChange={updateStreamFps} onStreamQualityChange={updateStreamQuality} onStreamTransportChange={updateStreamTransport} + onToggleStreamAudioMuted={toggleStreamAudioMuted} onShutdown={() => { if (!selectedSimulator) { return; @@ -2989,6 +3007,7 @@ export function AppShell({ !selectedSimulatorTransitionKind, )} streamConfig={streamConfig} + streamAudioMuted={streamAudioMuted} streamTransport={streamTransport} simulatorMenuOpen={simulatorMenuOpen} simulatorMenuRef={simulatorMenuRef} diff --git a/packages/client/src/features/accessibility/AccessibilityOverlay.test.ts b/packages/client/src/features/accessibility/AccessibilityOverlay.test.ts index c9880849..2fe246f4 100644 --- a/packages/client/src/features/accessibility/AccessibilityOverlay.test.ts +++ b/packages/client/src/features/accessibility/AccessibilityOverlay.test.ts @@ -28,6 +28,15 @@ describe("accessibilityDomTagName", () => { }), ).toBe("simdeck-range-and-filter-bar"); }); + + it("falls back when component metadata is not string-like", () => { + expect( + accessibilityDomTagName({ + source: "in-app-inspector", + type: { kind: "Button" } as unknown as string, + }), + ).toBe("simdeck-element"); + }); }); describe("AccessibilityOverlay", () => { @@ -103,4 +112,42 @@ describe("AccessibilityOverlay", () => { expect(markup).not.toContain("; disabled"); expect(markup).not.toContain(" title="); }); + + it("renders object-shaped accessibility metadata without crashing", () => { + const markup = renderToStaticMarkup( + createElement(AccessibilityOverlay, { + hoveredId: null, + roots: [ + { + frame: { height: 844, width: 390, x: 0, y: 0 }, + role: "application", + children: [ + { + AXLabel: { localized: "Continue" } as unknown as string, + AXValue: 42 as unknown as string, + frame: { height: 48, width: 180, x: 105, y: 720 }, + nativeScript: { + testID: 123, + type: { kind: "Button" }, + }, + placeholder: false as unknown as string, + source: "nativescript", + sourceLocation: { + file: { path: "/app/app.component.ts" } as unknown as string, + line: 12, + }, + type: { kind: "Button" } as unknown as string, + }, + ], + }, + ], + selectedId: "", + }), + ); + + expect(markup).toContain(" { }), ).toBe("~/assets/album-midnight.jpg"); }); + + it("ignores object-shaped accessibility text fields without crashing", () => { + const node: AccessibilityNode = { + AXIdentifier: { id: "continue-button" } as unknown as string, + AXLabel: { localized: "Continue" } as unknown as string, + AXUniqueId: { value: "unique" } as unknown as string, + AXValue: 42 as unknown as string, + role: { name: "button" } as unknown as string, + source: "in-app-inspector", + title: { value: "Continue" } as unknown as string, + type: { kind: "Button" } as unknown as string, + }; + + expect(() => buildAccessibilityTree([node])).not.toThrow(); + expect(primaryAccessibilityText(node)).toBe("42"); + expect(accessibilityIdentifier(node)).toBe(""); + expect(accessibilityKind(node)).toBe("Element"); + }); }); describe("findAccessibilityItemAtPoint", () => { diff --git a/packages/client/src/features/accessibility/accessibilityTree.ts b/packages/client/src/features/accessibility/accessibilityTree.ts index 5c1290fd..a6c99615 100644 --- a/packages/client/src/features/accessibility/accessibilityTree.ts +++ b/packages/client/src/features/accessibility/accessibilityTree.ts @@ -579,9 +579,14 @@ function frameContainsPoint( ); } -function cleanText(value: string | null | undefined): string | null { - const trimmed = value?.trim(); - return trimmed ? trimmed : null; +function cleanText(value: unknown): string | null { + if (typeof value === "string") { + return value.trim() || null; + } + if (typeof value === "number" || typeof value === "boolean") { + return String(value); + } + return null; } function displayAccessibilityKind( diff --git a/packages/client/src/features/simulators/SimulatorMenu.tsx b/packages/client/src/features/simulators/SimulatorMenu.tsx index 0347f7bf..c6c35e49 100644 --- a/packages/client/src/features/simulators/SimulatorMenu.tsx +++ b/packages/client/src/features/simulators/SimulatorMenu.tsx @@ -34,6 +34,7 @@ interface SimulatorMenuProps { onStreamFpsChange: (fps: StreamFps) => void; onStreamQualityChange: (quality: StreamQualityPreset) => void; onStreamTransportChange: (transport: StreamTransport) => void; + onToggleStreamAudioMuted: () => void; onToggleAppearance: () => void; onToggleDebug: () => void; onToggleMenu: () => void; @@ -47,6 +48,7 @@ interface SimulatorMenuProps { showBootButton: boolean; showStopButton: boolean; streamConfig: StreamConfig; + streamAudioMuted: boolean; streamTransport: StreamTransport; touchOverlayVisible: boolean; } @@ -74,6 +76,7 @@ export function SimulatorMenu({ onStreamFpsChange, onStreamQualityChange, onStreamTransportChange, + onToggleStreamAudioMuted, onToggleAppearance, onToggleDebug, onToggleMenu, @@ -87,6 +90,7 @@ export function SimulatorMenu({ showBootButton, showStopButton, streamConfig, + streamAudioMuted, streamTransport, touchOverlayVisible, }: SimulatorMenuProps) { @@ -200,6 +204,14 @@ export function SimulatorMenu({ )} +
diff --git a/packages/client/src/features/simulators/simulatorDisplay.test.ts b/packages/client/src/features/simulators/simulatorDisplay.test.ts index 4965e490..40461ccc 100644 --- a/packages/client/src/features/simulators/simulatorDisplay.test.ts +++ b/packages/client/src/features/simulators/simulatorDisplay.test.ts @@ -30,6 +30,17 @@ describe("simulatorDisplay", () => { ).toBe("watchOS 26.0"); }); + it("ignores non-string runtime metadata", () => { + expect( + simulatorRuntimeLabel( + simulator({ + runtimeIdentifier: { identifier: "unexpected" } as unknown as string, + runtimeName: null as unknown as string, + }), + ), + ).toBe(""); + }); + it("enables native chrome for Apple Watch simulators", () => { expect( shouldRenderNativeChrome( diff --git a/packages/client/src/features/simulators/simulatorDisplay.ts b/packages/client/src/features/simulators/simulatorDisplay.ts index d1fa89bf..9cbaaf87 100644 --- a/packages/client/src/features/simulators/simulatorDisplay.ts +++ b/packages/client/src/features/simulators/simulatorDisplay.ts @@ -65,13 +65,14 @@ function simulatorMetadataText(simulator: SimulatorMetadata): string { simulator.runtimeName, simulator.runtimeIdentifier, ] - .filter(Boolean) + .map(metadataTextValue) + .filter((value): value is string => Boolean(value)) .join(" ") .toLowerCase(); } -function formatRuntimeLabel(value: string | undefined): string | null { - const trimmed = value?.trim(); +function formatRuntimeLabel(value: unknown): string | null { + const trimmed = metadataTextValue(value); if (!trimmed) { return null; } @@ -89,3 +90,13 @@ function formatRuntimeLabel(value: string | undefined): string | null { } return trimmed; } + +function metadataTextValue(value: unknown): string | null { + if (typeof value === "string") { + return value.trim() || null; + } + if (typeof value === "number" || typeof value === "boolean") { + return String(value); + } + return null; +} diff --git a/packages/client/src/features/stream/streamTypes.ts b/packages/client/src/features/stream/streamTypes.ts index 4e9b6099..acee1ae2 100644 --- a/packages/client/src/features/stream/streamTypes.ts +++ b/packages/client/src/features/stream/streamTypes.ts @@ -1,6 +1,7 @@ import type { Size } from "../viewport/types"; export interface StreamConnectTarget { + audioMuted?: boolean; clientId?: string; platform?: string; remote?: boolean; diff --git a/packages/client/src/features/stream/streamWorkerClient.ts b/packages/client/src/features/stream/streamWorkerClient.ts index b49842a4..200e594e 100644 --- a/packages/client/src/features/stream/streamWorkerClient.ts +++ b/packages/client/src/features/stream/streamWorkerClient.ts @@ -97,6 +97,10 @@ export function sendWebRtcStreamControl(options: { ); } +export function setActiveStreamAudioMuted(muted: boolean) { + activeStreamClient?.setAudioMuted(muted); +} + function sendStreamQualityConfig(config: StreamConfig): boolean { const encoded = JSON.stringify({ config: streamQualityPayload(config), @@ -230,6 +234,7 @@ function compareVideoToImage( export function buildStreamTarget( udid: string, options: { + audioMuted?: boolean; clientId?: string; platform?: string; remote?: boolean; @@ -238,6 +243,7 @@ export function buildStreamTarget( } = {}, ): StreamConnectTarget { return { + audioMuted: options.audioMuted, clientId: options.clientId, platform: options.platform, remote: options.remote, @@ -290,6 +296,7 @@ interface StreamClientBackend { disconnect(): void; applyStreamConfig?(config?: StreamConfig): void | Promise; sendControl?(payload: unknown): boolean; + setAudioMuted?(muted: boolean): void; } export interface VisualArtifactSample { @@ -389,6 +396,11 @@ interface WebCodecsVideoDecoderConstructor { } interface WebRtcAnswerPayload extends RTCSessionDescriptionInit { + audio?: { + channels?: number; + codec?: string; + sampleRate?: number; + }; video?: { height?: number; width?: number; @@ -1295,6 +1307,8 @@ function hexByte(byte: number): string { } class WebRtcStreamClient implements StreamClientBackend { + private audioElement: HTMLAudioElement | null = null; + private audioMuted = true; private animationFrame = 0; private canvas: HTMLCanvasElement | null = null; private canvasContext: CanvasRenderingContext2D | null = null; @@ -1408,6 +1422,7 @@ class WebRtcStreamClient implements StreamClientBackend { this.shouldReconnect = true; this.remoteMode = Boolean(target.remote); this.streamTarget = target; + this.audioMuted = target.audioMuted ?? true; if (!wasReconnecting) { this.reconnectDelayMs = WEBRTC_RECONNECT_BASE_DELAY_MS; } @@ -1435,6 +1450,14 @@ class WebRtcStreamClient implements StreamClientBackend { const useRgbaTransport = shouldUseLocalAndroidRgbaWebRtc(target); this.rgbaMode = useRgbaTransport; this.attachDiagnostics(peerConnection, target, generation); + const audioTransceiver = peerConnection.addTransceiver("audio", { + direction: "recvonly", + }); + configureAudioReceiverCodecPreferences(audioTransceiver); + configureLowLatencyReceiver( + audioTransceiver.receiver, + receiverBufferSeconds(target), + ); if (!useRgbaTransport) { this.startReceiverStatsPolling(peerConnection, target, generation); const transceiver = peerConnection.addTransceiver("video", { @@ -1485,17 +1508,21 @@ class WebRtcStreamClient implements StreamClientBackend { }; peerConnection.ontrack = (event) => { - if (useRgbaTransport) { + if (generation !== this.connectGeneration) { return; } - if (generation !== this.connectGeneration) { + if (event.track.kind === "audio") { + this.attachAudioTrack(event.track, generation); + return; + } + if (useRgbaTransport || event.track.kind !== "video") { return; } event.track.contentHint = "motion"; for (const receiver of peerConnection.getReceivers()) { configureLowLatencyReceiver(receiver, receiverBufferSeconds(target)); } - const stream = event.streams[0] ?? new MediaStream([event.track]); + const stream = new MediaStream([event.track]); const video = document.createElement("video"); video.autoplay = true; video.className = "stream-video"; @@ -1606,6 +1633,19 @@ class WebRtcStreamClient implements StreamClientBackend { return sendDataChannelMessage(this.controlChannel, JSON.stringify(payload)); } + setAudioMuted(muted: boolean) { + this.audioMuted = muted; + if (!this.audioElement) { + return; + } + this.audioElement.muted = muted; + if (!muted) { + void this.audioElement.play().catch(() => { + // Some browsers require the menu click that unmutes to happen in the page. + }); + } + } + async applyStreamConfig(config?: StreamConfig) { if (!config) { return; @@ -1703,6 +1743,12 @@ class WebRtcStreamClient implements StreamClientBackend { this.video.remove(); } this.video = null; + this.audioElement?.pause(); + if (this.audioElement) { + this.audioElement.srcObject = null; + this.audioElement.remove(); + } + this.audioElement = null; this.reportedVideoHeight = 0; this.reportedVideoWidth = 0; this.controlChannel?.close(); @@ -2122,6 +2168,36 @@ class WebRtcStreamClient implements StreamClientBackend { } } + private attachAudioTrack(track: MediaStreamTrack, generation: number) { + this.audioElement?.pause(); + if (this.audioElement) { + this.audioElement.srcObject = null; + this.audioElement.remove(); + } + const audio = document.createElement("audio"); + audio.autoplay = true; + audio.muted = this.audioMuted; + audio.preload = "auto"; + audio.srcObject = new MediaStream([track]); + audio.style.display = "none"; + document.body.appendChild(audio); + this.audioElement = audio; + const startPlayback = () => { + if ( + generation !== this.connectGeneration || + audio !== this.audioElement + ) { + return; + } + void audio.play().catch(() => { + // Muted autoplay is best effort; unmuting from the menu retries playback. + }); + }; + audio.addEventListener("loadedmetadata", startPlayback); + audio.addEventListener("canplay", startPlayback); + startPlayback(); + } + private attachRgbaDataChannel(channel: RTCDataChannel, generation: number) { this.rgbaChannel?.close(); this.rgbaChannel = channel; @@ -2756,6 +2832,26 @@ function configureReceiverCodecPreferences(transceiver: RTCRtpTransceiver) { ]); } +function configureAudioReceiverCodecPreferences( + transceiver: RTCRtpTransceiver, +) { + if (!transceiver.setCodecPreferences) { + return; + } + const capabilities = RTCRtpReceiver.getCapabilities("audio"); + const codecs = capabilities?.codecs ?? []; + const preferred = codecs.filter( + (codec) => codec.mimeType.toLowerCase() === "audio/opus", + ); + if (preferred.length === 0) { + return; + } + transceiver.setCodecPreferences([ + ...preferred, + ...codecs.filter((codec) => codec.mimeType.toLowerCase() !== "audio/opus"), + ]); +} + function safariBaselineH264Offer( offer: RTCSessionDescriptionInit, ): RTCSessionDescriptionInit { @@ -3033,6 +3129,10 @@ export class StreamWorkerClient { ); } + setAudioMuted(muted: boolean) { + this.backend?.setAudioMuted?.(muted); + } + applyStreamConfig(config?: StreamConfig) { try { const result = this.backend?.applyStreamConfig?.(config); diff --git a/packages/client/src/features/stream/useLiveStream.ts b/packages/client/src/features/stream/useLiveStream.ts index 77990731..350c4f53 100644 --- a/packages/client/src/features/stream/useLiveStream.ts +++ b/packages/client/src/features/stream/useLiveStream.ts @@ -28,6 +28,7 @@ const CLIENT_TELEMETRY_ID_STORAGE_KEY = "simdeck.streamClientId"; const VISUAL_ARTIFACT_TELEMETRY_INTERVAL_MS = 30000; interface UseLiveStreamOptions { + audioMuted?: boolean; canvasElement: HTMLCanvasElement | null; paused?: boolean; remote?: boolean; @@ -108,6 +109,7 @@ function isViewerForeground(canvasVisible: boolean): boolean { } export function useLiveStream({ + audioMuted = true, canvasElement, paused = false, remote = false, @@ -370,6 +372,7 @@ export function useLiveStream({ workerClient.connect( buildStreamTarget(simulator.udid, { clientId: clientTelemetryIdRef.current, + audioMuted, platform: simulator.platform, remote, streamConfig, @@ -387,8 +390,13 @@ export function useLiveStream({ paused, remote, streamTransport, + audioMuted, ]); + useEffect(() => { + workerClientRef.current?.setAudioMuted(audioMuted); + }, [audioMuted]); + useEffect(() => { if (!simulator?.udid || paused) { return; diff --git a/packages/client/src/features/toolbar/Toolbar.tsx b/packages/client/src/features/toolbar/Toolbar.tsx index 128eb85a..ad72b4d9 100644 --- a/packages/client/src/features/toolbar/Toolbar.tsx +++ b/packages/client/src/features/toolbar/Toolbar.tsx @@ -51,6 +51,7 @@ interface ToolbarProps { onStreamFpsChange: (fps: StreamFps) => void; onStreamQualityChange: (quality: StreamQualityPreset) => void; onStreamTransportChange: (transport: StreamTransport) => void; + onToggleStreamAudioMuted: () => void; onToggleAppearance: () => void; onToggleDebug: () => void; onToggleDevTools: () => void; @@ -71,6 +72,7 @@ interface ToolbarProps { showBootButton: boolean; showStopButton: boolean; streamConfig: StreamConfig; + streamAudioMuted: boolean; streamTransport: StreamTransport; touchOverlayVisible: boolean; menuOpen: boolean; @@ -114,6 +116,7 @@ export function Toolbar({ onStreamFpsChange, onStreamQualityChange, onStreamTransportChange, + onToggleStreamAudioMuted, onToggleAppearance, onToggleDebug, onToggleDevTools, @@ -133,6 +136,7 @@ export function Toolbar({ showBootButton, showStopButton, streamConfig, + streamAudioMuted, streamTransport, simulatorMenuOpen, simulatorMenuRef, @@ -194,6 +198,7 @@ export function Toolbar({ onStreamFpsChange={onStreamFpsChange} onStreamQualityChange={onStreamQualityChange} onStreamTransportChange={onStreamTransportChange} + onToggleStreamAudioMuted={onToggleStreamAudioMuted} onToggleAppearance={onToggleAppearance} onToggleDebug={onToggleDebug} onToggleMenu={onToggleMenu} @@ -208,6 +213,7 @@ export function Toolbar({ showStopButton={showStopButton} canInstallApp={canInstallApp} streamConfig={streamConfig} + streamAudioMuted={streamAudioMuted} streamTransport={streamTransport} touchOverlayVisible={touchOverlayVisible} /> diff --git a/packages/client/src/styles/components.css b/packages/client/src/styles/components.css index 7fb17266..2c7ca799 100644 --- a/packages/client/src/styles/components.css +++ b/packages/client/src/styles/components.css @@ -413,6 +413,22 @@ text-transform: uppercase; } +.menu-toggle { + display: flex; + align-items: center; + gap: 8px; + min-height: 30px; + color: var(--text); + font-size: 12px; +} + +.menu-toggle input { + width: 15px; + height: 15px; + flex: 0 0 auto; + accent-color: var(--accent); +} + .menu-select { width: 100%; height: 32px; diff --git a/packages/server/build.rs b/packages/server/build.rs index 7ecccf1d..6a9ca7c4 100644 --- a/packages/server/build.rs +++ b/packages/server/build.rs @@ -101,7 +101,9 @@ fn main() { "Foundation", "Accelerate", "AppKit", + "AudioToolbox", "AVFoundation", + "CoreAudio", "CoreImage", "CoreGraphics", "CoreMedia", diff --git a/packages/server/native/bridge/XCWNativeBridge.h b/packages/server/native/bridge/XCWNativeBridge.h index 35ffc647..cd6d6fca 100644 --- a/packages/server/native/bridge/XCWNativeBridge.h +++ b/packages/server/native/bridge/XCWNativeBridge.h @@ -28,7 +28,15 @@ typedef struct xcw_native_frame { xcw_native_shared_bytes data; } xcw_native_frame; +typedef struct xcw_native_audio_sample { + uint64_t timestamp_us; + uint32_t sample_rate; + uint16_t channels; + xcw_native_shared_bytes data; +} xcw_native_audio_sample; + typedef void (*xcw_native_frame_callback)(const xcw_native_frame * _Nonnull frame, void * _Nullable user_data); +typedef void (*xcw_native_audio_callback)(const xcw_native_audio_sample * _Nonnull sample, void * _Nullable user_data); void xcw_native_initialize_app(void); void xcw_native_run_main_loop_slice(double duration_seconds); @@ -109,6 +117,10 @@ void xcw_native_h264_encoder_destroy(void * _Nullable handle); bool xcw_native_h264_encoder_encode_rgba(void * _Nonnull handle, const uint8_t * _Nonnull rgba, size_t length, uint32_t width, uint32_t height, uint64_t timestamp_us, char * _Nullable * _Nullable error_message); void xcw_native_h264_encoder_request_keyframe(void * _Nonnull handle); +void * _Nullable xcw_native_audio_capture_create(const int32_t * _Nullable process_ids, size_t process_count, xcw_native_audio_callback _Nullable callback, void * _Nullable user_data, char * _Nullable * _Nullable error_message); +bool xcw_native_audio_capture_update_processes(void * _Nonnull handle, const int32_t * _Nullable process_ids, size_t process_count, char * _Nullable * _Nullable error_message); +void xcw_native_audio_capture_destroy(void * _Nullable handle); + void xcw_native_free_string(char * _Nullable value); void xcw_native_free_bytes(xcw_native_owned_bytes bytes); void xcw_native_release_shared_bytes(xcw_native_shared_bytes bytes); diff --git a/packages/server/native/bridge/XCWNativeBridge.m b/packages/server/native/bridge/XCWNativeBridge.m index 1c2746d7..8292ce6b 100644 --- a/packages/server/native/bridge/XCWNativeBridge.m +++ b/packages/server/native/bridge/XCWNativeBridge.m @@ -8,8 +8,15 @@ #import "XCWSimctl.h" #import +#import +#import +#import +#import #import +#import #import +#include +#include #include #include @@ -315,6 +322,820 @@ - (void)invalidate { @end +static NSString *XCWAudioDictionaryKey(const char *key) { + return [NSString stringWithUTF8String:key] ?: @""; +} + +static NSString *XCWAudioOSStatusString(OSStatus status) { + UInt32 code = CFSwapInt32HostToBig((UInt32)status); + char text[5] = {0}; + memcpy(text, &code, 4); + BOOL printable = YES; + for (NSUInteger index = 0; index < 4; index++) { + if (text[index] < 32 || text[index] > 126) { + printable = NO; + break; + } + } + if (printable) { + return [NSString stringWithFormat:@"%d ('%s')", (int)status, text]; + } + return [NSString stringWithFormat:@"%d", (int)status]; +} + +static NSError *XCWAudioCaptureError(NSInteger code, NSString *description) { + return [NSError errorWithDomain:@"SimDeck.AudioCapture" + code:code + userInfo:@{ NSLocalizedDescriptionKey: description ?: @"Audio capture failed." }]; +} + +static NSError *XCWAudioCaptureStatusError(NSInteger code, NSString *operation, OSStatus status) { + return XCWAudioCaptureError(code, [NSString stringWithFormat:@"%@ failed with OSStatus %@.", operation, XCWAudioOSStatusString(status)]); +} + +static const uint32_t XCWOpusSampleRate = 48000; +static const uint16_t XCWOpusChannels = 2; +static const UInt32 XCWOpusFramesPerPacket = 960; +static const UInt32 XCWOpusBitRate = 96000; +static const UInt32 XCWOpusFallbackMaxPacketBytes = 1500; +static const NSUInteger XCWAudioProcessStableRefreshes = 3; +static const OSStatus XCWAudioConverterNoDataStatus = -1; + +static int16_t XCWClampPCM16(double value) { + if (!isfinite(value)) { + return 0; + } + if (value <= -1.0) { + return INT16_MIN; + } + if (value >= 1.0) { + return INT16_MAX; + } + return (int16_t)lrint(value * 32767.0); +} + +static int16_t XCWReadPCM16Sample(const AudioBufferList *bufferList, + const AudioStreamBasicDescription *asbd, + NSUInteger frame, + NSUInteger channel) { + if (bufferList == NULL || asbd == NULL || bufferList->mNumberBuffers == 0) { + return 0; + } + + const UInt32 bitsPerChannel = asbd->mBitsPerChannel; + const NSUInteger bytesPerSample = MAX((NSUInteger)bitsPerChannel / 8, 1); + const BOOL nonInterleaved = (asbd->mFormatFlags & kAudioFormatFlagIsNonInterleaved) != 0; + const BOOL isFloat = (asbd->mFormatFlags & kAudioFormatFlagIsFloat) != 0; + const BOOL isSigned = (asbd->mFormatFlags & kAudioFormatFlagIsSignedInteger) != 0; + const BOOL isBigEndian = (asbd->mFormatFlags & kAudioFormatFlagIsBigEndian) != 0; + const NSUInteger sourceChannels = MAX((NSUInteger)asbd->mChannelsPerFrame, 1); + const NSUInteger bufferIndex = nonInterleaved + ? MIN(channel, (NSUInteger)bufferList->mNumberBuffers - 1) + : 0; + const NSUInteger channelInBuffer = nonInterleaved ? 0 : MIN(channel, sourceChannels - 1); + const AudioBuffer buffer = bufferList->mBuffers[bufferIndex]; + if (buffer.mData == NULL || buffer.mDataByteSize == 0) { + return 0; + } + + const NSUInteger fallbackBytesPerFrame = bytesPerSample * (nonInterleaved ? 1 : sourceChannels); + const NSUInteger bytesPerFrame = MAX((NSUInteger)asbd->mBytesPerFrame, fallbackBytesPerFrame); + const NSUInteger offset = frame * bytesPerFrame + channelInBuffer * bytesPerSample; + if (offset + bytesPerSample > buffer.mDataByteSize) { + return 0; + } + + const uint8_t *sample = (const uint8_t *)buffer.mData + offset; + if (isFloat && bytesPerSample == sizeof(float)) { + float value = 0.0f; + memcpy(&value, sample, sizeof(value)); + return XCWClampPCM16((double)value); + } + if (isFloat && bytesPerSample == sizeof(double)) { + double value = 0.0; + memcpy(&value, sample, sizeof(value)); + return XCWClampPCM16(value); + } + if (bytesPerSample == sizeof(int16_t)) { + uint16_t raw = 0; + memcpy(&raw, sample, sizeof(raw)); + if (isBigEndian) { + raw = CFSwapInt16BigToHost(raw); + } + return (int16_t)raw; + } + if (bytesPerSample == sizeof(int32_t)) { + uint32_t raw = 0; + memcpy(&raw, sample, sizeof(raw)); + if (isBigEndian) { + raw = CFSwapInt32BigToHost(raw); + } + return (int16_t)(((int32_t)raw) >> 16); + } + if (bytesPerSample == sizeof(uint8_t)) { + if (isSigned) { + return (int16_t)(((int8_t)sample[0]) << 8); + } + return (int16_t)(((int)sample[0] - 128) << 8); + } + + return 0; +} + +static NSUInteger XCWAudioFrameCount(const AudioBufferList *bufferList, + const AudioStreamBasicDescription *asbd) { + if (bufferList == NULL || asbd == NULL || bufferList->mNumberBuffers == 0) { + return 0; + } + const AudioBuffer buffer = bufferList->mBuffers[0]; + if (buffer.mData == NULL || buffer.mDataByteSize == 0) { + return 0; + } + const NSUInteger bytesPerSample = MAX((NSUInteger)asbd->mBitsPerChannel / 8, 1); + const BOOL nonInterleaved = (asbd->mFormatFlags & kAudioFormatFlagIsNonInterleaved) != 0; + const NSUInteger sourceChannels = MAX((NSUInteger)asbd->mChannelsPerFrame, 1); + const NSUInteger fallbackBytesPerFrame = bytesPerSample * (nonInterleaved ? 1 : sourceChannels); + const NSUInteger bytesPerFrame = MAX((NSUInteger)asbd->mBytesPerFrame, fallbackBytesPerFrame); + if (bytesPerFrame == 0) { + return 0; + } + return (NSUInteger)buffer.mDataByteSize / bytesPerFrame; +} + +static NSData *XCWPCM16InterleavedDataFromAudioBufferList(const AudioBufferList *bufferList, + const AudioStreamBasicDescription *asbd, + uint32_t *sampleRate, + uint16_t *channels) { + if (bufferList == NULL || asbd == NULL || asbd->mFormatID != kAudioFormatLinearPCM) { + return nil; + } + const NSUInteger frameCount = XCWAudioFrameCount(bufferList, asbd); + const NSUInteger sourceChannels = MAX((NSUInteger)asbd->mChannelsPerFrame, 1); + const NSUInteger outputChannels = MIN(sourceChannels, (NSUInteger)2); + if (frameCount == 0 || outputChannels == 0) { + return nil; + } + + NSMutableData *output = [NSMutableData dataWithLength:frameCount * outputChannels * sizeof(int16_t)]; + int16_t *outputSamples = (int16_t *)output.mutableBytes; + for (NSUInteger frame = 0; frame < frameCount; frame++) { + for (NSUInteger channel = 0; channel < outputChannels; channel++) { + outputSamples[frame * outputChannels + channel] = XCWReadPCM16Sample(bufferList, asbd, frame, channel); + } + } + + if (sampleRate != NULL) { + *sampleRate = (uint32_t)llround(asbd->mSampleRate > 0 ? asbd->mSampleRate : 48000.0); + } + if (channels != NULL) { + *channels = (uint16_t)outputChannels; + } + return output; +} + +static uint64_t XCWAudioTimestampUS(const AudioTimeStamp *timeStamp) { + if (timeStamp != NULL && (timeStamp->mFlags & kAudioTimeStampHostTimeValid) != 0 && timeStamp->mHostTime != 0) { + return AudioConvertHostTimeToNanos(timeStamp->mHostTime) / 1000; + } + return (uint64_t)llround([[NSDate date] timeIntervalSince1970] * 1000000.0); +} + +static AudioObjectID XCWAudioProcessObjectIDForPID(pid_t pid) { + if (pid <= 0) { + return kAudioObjectUnknown; + } + AudioObjectPropertyAddress address = { + .mSelector = kAudioHardwarePropertyTranslatePIDToProcessObject, + .mScope = kAudioObjectPropertyScopeGlobal, + .mElement = kAudioObjectPropertyElementMain, + }; + AudioObjectID processObjectID = kAudioObjectUnknown; + UInt32 size = sizeof(processObjectID); + OSStatus status = AudioObjectGetPropertyData(kAudioObjectSystemObject, + &address, + sizeof(pid), + &pid, + &size, + &processObjectID); + if (status != noErr) { + return kAudioObjectUnknown; + } + return processObjectID; +} + +static NSArray *XCWAudioProcessObjectIDsForProcessIDs(const int32_t *processIDs, + size_t processCount) { + NSMutableSet *seen = [NSMutableSet set]; + NSMutableArray *objects = [NSMutableArray array]; + for (size_t index = 0; index < processCount; index++) { + pid_t pid = (pid_t)processIDs[index]; + if (pid <= 0) { + continue; + } + AudioObjectID objectID = XCWAudioProcessObjectIDForPID(pid); + if (objectID == kAudioObjectUnknown) { + continue; + } + NSNumber *boxed = @(objectID); + if ([seen containsObject:boxed]) { + continue; + } + [seen addObject:boxed]; + [objects addObject:boxed]; + } + [objects sortUsingSelector:@selector(compare:)]; + return objects; +} + +static CATapDescription *XCWAudioTapDescription(NSArray *processObjectIDs) API_AVAILABLE(macos(14.2)) { + CATapDescription *description = [[CATapDescription alloc] initStereoMixdownOfProcesses:processObjectIDs]; + description.name = @"SimDeck Simulator Audio"; + description.privateTap = YES; + description.muteBehavior = CATapMutedWhenTapped; + description.mixdown = YES; + description.mono = NO; + description.exclusive = NO; + return description; +} + +static NSString *XCWAudioTapUID(AudioObjectID tapID, NSError * _Nullable __autoreleasing *error) { + AudioObjectPropertyAddress address = { + .mSelector = kAudioTapPropertyUID, + .mScope = kAudioObjectPropertyScopeGlobal, + .mElement = kAudioObjectPropertyElementMain, + }; + CFStringRef uid = NULL; + UInt32 size = sizeof(uid); + OSStatus status = AudioObjectGetPropertyData(tapID, &address, 0, NULL, &size, &uid); + if (status != noErr || uid == NULL) { + if (error != NULL) { + *error = XCWAudioCaptureStatusError(22, @"Read Core Audio tap UID", status); + } + return nil; + } + return CFBridgingRelease(uid); +} + +static BOOL XCWAudioGetObjectStreamFormat(AudioObjectID objectID, + AudioObjectPropertySelector selector, + AudioObjectPropertyScope scope, + AudioStreamBasicDescription *asbd) { + if (asbd == NULL || objectID == kAudioObjectUnknown) { + return NO; + } + AudioObjectPropertyAddress address = { + .mSelector = selector, + .mScope = scope, + .mElement = kAudioObjectPropertyElementMain, + }; + UInt32 size = sizeof(*asbd); + OSStatus status = AudioObjectGetPropertyData(objectID, &address, 0, NULL, &size, asbd); + return status == noErr && asbd->mSampleRate > 0 && asbd->mChannelsPerFrame > 0; +} + +typedef struct XCWOpusInputContext { + const uint8_t *bytes; + UInt32 byteCount; + UInt32 bytesPerFrame; + UInt16 channels; + UInt32 consumedBytes; +} XCWOpusInputContext; + +static OSStatus XCWOpusEncoderInputProc(AudioConverterRef inAudioConverter, + UInt32 *ioNumberDataPackets, + AudioBufferList *ioData, + AudioStreamPacketDescription **outDataPacketDescription, + void *inUserData) { + (void)inAudioConverter; + if (outDataPacketDescription != NULL) { + *outDataPacketDescription = NULL; + } + if (ioNumberDataPackets == NULL || ioData == NULL || inUserData == NULL) { + return paramErr; + } + XCWOpusInputContext *context = (XCWOpusInputContext *)inUserData; + if (context->bytes == NULL || context->bytesPerFrame == 0 || context->consumedBytes >= context->byteCount) { + *ioNumberDataPackets = 0; + return XCWAudioConverterNoDataStatus; + } + + UInt32 availableBytes = context->byteCount - context->consumedBytes; + UInt32 availablePackets = availableBytes / context->bytesPerFrame; + UInt32 packets = MIN(*ioNumberDataPackets, availablePackets); + if (packets == 0) { + *ioNumberDataPackets = 0; + return XCWAudioConverterNoDataStatus; + } + + UInt32 bytes = packets * context->bytesPerFrame; + ioData->mNumberBuffers = 1; + ioData->mBuffers[0].mNumberChannels = context->channels; + ioData->mBuffers[0].mDataByteSize = bytes; + ioData->mBuffers[0].mData = (void *)(context->bytes + context->consumedBytes); + context->consumedBytes += bytes; + *ioNumberDataPackets = packets; + return noErr; +} + +@interface XCWOpusAudioEncoder : NSObject + +@property (nonatomic, readonly) uint16_t channels; + +- (NSArray *)encodePCM:(NSData *)pcm + sampleRate:(uint32_t)sampleRate + channels:(uint16_t)channels + error:(NSError * _Nullable __autoreleasing *)error; +- (void)invalidate; + +@end + +@implementation XCWOpusAudioEncoder { + AudioConverterRef _converter; + NSMutableData *_pendingPCM; + uint32_t _inputSampleRate; + uint16_t _inputChannels; + UInt32 _inputBytesPerFrame; + UInt32 _maxOutputPacketSize; + NSUInteger _inputFramesPerOpusPacket; +} + +- (instancetype)init { + self = [super init]; + if (self == nil) { + return nil; + } + _pendingPCM = [NSMutableData data]; + _channels = XCWOpusChannels; + return self; +} + +- (void)dealloc { + [self invalidate]; +} + +- (NSArray *)encodePCM:(NSData *)pcm + sampleRate:(uint32_t)sampleRate + channels:(uint16_t)channels + error:(NSError * _Nullable __autoreleasing *)error { + if (pcm.length == 0 || sampleRate == 0 || channels == 0) { + return @[]; + } + if (_converter == NULL || _inputSampleRate != sampleRate || _inputChannels != channels) { + [self invalidate]; + if (![self configureWithSampleRate:sampleRate channels:channels error:error]) { + return @[]; + } + } + + [_pendingPCM appendData:pcm]; + NSMutableArray *packets = [NSMutableArray array]; + while ([self pendingFrameCount] >= _inputFramesPerOpusPacket) { + NSData *packet = [self encodeNextPacket:error]; + if (packet == nil) { + break; + } + if (packet.length > 0) { + [packets addObject:packet]; + } + } + return packets; +} + +- (BOOL)configureWithSampleRate:(uint32_t)sampleRate + channels:(uint16_t)channels + error:(NSError * _Nullable __autoreleasing *)error { + _inputSampleRate = sampleRate; + _inputChannels = channels; + _inputBytesPerFrame = MAX((UInt32)channels * (UInt32)sizeof(int16_t), 1); + _inputFramesPerOpusPacket = MAX((NSUInteger)llround(((double)sampleRate * (double)XCWOpusFramesPerPacket) / (double)XCWOpusSampleRate), (NSUInteger)1); + + AudioStreamBasicDescription input = {0}; + input.mSampleRate = sampleRate; + input.mFormatID = kAudioFormatLinearPCM; + input.mFormatFlags = kAudioFormatFlagIsSignedInteger | kAudioFormatFlagIsPacked; + input.mBytesPerPacket = _inputBytesPerFrame; + input.mFramesPerPacket = 1; + input.mBytesPerFrame = _inputBytesPerFrame; + input.mChannelsPerFrame = channels; + input.mBitsPerChannel = 16; + + AudioStreamBasicDescription output = {0}; + output.mSampleRate = XCWOpusSampleRate; + output.mFormatID = kAudioFormatOpus; + output.mChannelsPerFrame = XCWOpusChannels; + output.mFramesPerPacket = XCWOpusFramesPerPacket; + + OSStatus status = AudioConverterNew(&input, &output, &_converter); + if (status != noErr || _converter == NULL) { + if (error != NULL) { + *error = XCWAudioCaptureStatusError(31, @"Create Core Audio Opus encoder", status); + } + _converter = NULL; + return NO; + } + + UInt32 bitRate = XCWOpusBitRate; + (void)AudioConverterSetProperty(_converter, kAudioConverterEncodeBitRate, sizeof(bitRate), &bitRate); + + _maxOutputPacketSize = 0; + UInt32 propertySize = sizeof(_maxOutputPacketSize); + status = AudioConverterGetProperty(_converter, + kAudioConverterPropertyMaximumOutputPacketSize, + &propertySize, + &_maxOutputPacketSize); + if (status != noErr || _maxOutputPacketSize == 0) { + _maxOutputPacketSize = XCWOpusFallbackMaxPacketBytes; + } + _maxOutputPacketSize = MIN(MAX(_maxOutputPacketSize, (UInt32)256), (UInt32)4096); + [_pendingPCM setLength:0]; + return YES; +} + +- (NSUInteger)pendingFrameCount { + if (_inputBytesPerFrame == 0) { + return 0; + } + return _pendingPCM.length / _inputBytesPerFrame; +} + +- (NSData *)encodeNextPacket:(NSError * _Nullable __autoreleasing *)error { + if (_converter == NULL || _inputBytesPerFrame == 0 || _inputFramesPerOpusPacket == 0) { + return nil; + } + const NSUInteger inputBytes = MIN(_pendingPCM.length, _inputFramesPerOpusPacket * (NSUInteger)_inputBytesPerFrame); + if (inputBytes == 0 || inputBytes > UINT32_MAX) { + return nil; + } + + XCWOpusInputContext context = { + .bytes = (const uint8_t *)_pendingPCM.bytes, + .byteCount = (UInt32)inputBytes, + .bytesPerFrame = _inputBytesPerFrame, + .channels = _inputChannels, + .consumedBytes = 0, + }; + NSMutableData *output = [NSMutableData dataWithLength:_maxOutputPacketSize]; + AudioBufferList outputBuffer = { + .mNumberBuffers = 1, + .mBuffers = { + { + .mNumberChannels = XCWOpusChannels, + .mDataByteSize = _maxOutputPacketSize, + .mData = output.mutableBytes, + }, + }, + }; + UInt32 outputPackets = 1; + AudioStreamPacketDescription packetDescription = {0}; + OSStatus status = AudioConverterFillComplexBuffer(_converter, + XCWOpusEncoderInputProc, + &context, + &outputPackets, + &outputBuffer, + &packetDescription); + if (context.consumedBytes > 0 && context.consumedBytes <= _pendingPCM.length) { + [_pendingPCM replaceBytesInRange:NSMakeRange(0, context.consumedBytes) + withBytes:NULL + length:0]; + } + if (status == XCWAudioConverterNoDataStatus || outputPackets == 0 || outputBuffer.mBuffers[0].mDataByteSize == 0) { + if (context.consumedBytes == 0) { + return nil; + } + return [NSData data]; + } + if (status != noErr) { + if (error != NULL) { + *error = XCWAudioCaptureStatusError(32, @"Encode Opus audio packet", status); + } + return nil; + } + output.length = outputBuffer.mBuffers[0].mDataByteSize; + return output; +} + +- (void)invalidate { + if (_converter != NULL) { + AudioConverterDispose(_converter); + _converter = NULL; + } + [_pendingPCM setLength:0]; + _inputSampleRate = 0; + _inputChannels = 0; + _inputBytesPerFrame = 0; + _maxOutputPacketSize = 0; + _inputFramesPerOpusPacket = 0; +} + +@end + +@class XCWNativeAudioCapture; +static OSStatus XCWNativeAudioDeviceIOProc(AudioObjectID inDevice, + const AudioTimeStamp *inNow, + const AudioBufferList *inInputData, + const AudioTimeStamp *inInputTime, + AudioBufferList *outOutputData, + const AudioTimeStamp *inOutputTime, + void *inClientData); + +@interface XCWNativeAudioCapture : NSObject + +- (instancetype)initWithAudioCallback:(xcw_native_audio_callback)callback + userData:(void *)userData; +- (BOOL)startWithProcessIDs:(const int32_t *)processIDs + count:(size_t)processCount + error:(NSError * _Nullable __autoreleasing *)error; +- (BOOL)updateProcessIDs:(const int32_t *)processIDs + count:(size_t)processCount + error:(NSError * _Nullable __autoreleasing *)error; +- (BOOL)applyProcessIDs:(const int32_t *)processIDs + count:(size_t)processCount + requireProcesses:(BOOL)requireProcesses + debounceChanges:(BOOL)debounceChanges + error:(NSError * _Nullable __autoreleasing *)error; +- (BOOL)shouldApplyProcessObjectIDs:(NSArray *)processObjectIDs; +- (void)clearPendingProcessObjectIDs; +- (void)invalidate; +- (void)handleInputData:(const AudioBufferList *)inputData + inputTime:(const AudioTimeStamp *)inputTime; + +@end + +@implementation XCWNativeAudioCapture { + xcw_native_audio_callback _callback; + void *_callbackUserData; + BOOL _invalidated; + AudioObjectID _tapID; + AudioObjectID _aggregateDeviceID; + AudioDeviceIOProcID _ioProcID; + AudioStreamBasicDescription _streamDescription; + NSArray *_processObjectIDs; + NSArray *_pendingProcessObjectIDs; + NSUInteger _pendingProcessObjectIDRefreshes; + XCWOpusAudioEncoder *_opusEncoder; +} + +- (instancetype)initWithAudioCallback:(xcw_native_audio_callback)callback + userData:(void *)userData { + self = [super init]; + if (self == nil) { + return nil; + } + _callback = callback; + _callbackUserData = userData; + _tapID = kAudioObjectUnknown; + _aggregateDeviceID = kAudioObjectUnknown; + _ioProcID = NULL; + _processObjectIDs = @[]; + _pendingProcessObjectIDs = nil; + _pendingProcessObjectIDRefreshes = 0; + _opusEncoder = [[XCWOpusAudioEncoder alloc] init]; + return self; +} + +- (void)dealloc { + [self invalidate]; +} + +- (BOOL)startWithProcessIDs:(const int32_t *)processIDs + count:(size_t)processCount + error:(NSError * _Nullable __autoreleasing *)error { + return [self applyProcessIDs:processIDs count:processCount requireProcesses:YES debounceChanges:NO error:error]; +} + +- (BOOL)updateProcessIDs:(const int32_t *)processIDs + count:(size_t)processCount + error:(NSError * _Nullable __autoreleasing *)error { + return [self applyProcessIDs:processIDs count:processCount requireProcesses:NO debounceChanges:YES error:error]; +} + +- (BOOL)applyProcessIDs:(const int32_t *)processIDs + count:(size_t)processCount + requireProcesses:(BOOL)requireProcesses + debounceChanges:(BOOL)debounceChanges + error:(NSError * _Nullable __autoreleasing *)error { + if (@available(macOS 14.2, *)) { + NSArray *processObjectIDs = XCWAudioProcessObjectIDsForProcessIDs(processIDs, processCount); + if (_aggregateDeviceID != kAudioObjectUnknown && [_processObjectIDs isEqualToArray:processObjectIDs]) { + [self clearPendingProcessObjectIDs]; + return YES; + } + if (debounceChanges && _aggregateDeviceID != kAudioObjectUnknown && ![self shouldApplyProcessObjectIDs:processObjectIDs]) { + return YES; + } + if (processObjectIDs.count == 0) { + [self clearPendingProcessObjectIDs]; + [self stopGraph]; + if (requireProcesses && error != NULL) { + *error = XCWAudioCaptureError(20, @"No simulator audio processes are currently connected to Core Audio."); + } + return !requireProcesses; + } + [self stopGraph]; + return [self startGraphWithProcessObjectIDs:processObjectIDs error:error]; + } + + if (error != NULL) { + *error = XCWAudioCaptureError(21, @"Per-simulator audio capture requires macOS 14.2 or newer."); + } + return NO; +} + +- (BOOL)shouldApplyProcessObjectIDs:(NSArray *)processObjectIDs { + if (_pendingProcessObjectIDs != nil && [_pendingProcessObjectIDs isEqualToArray:processObjectIDs]) { + _pendingProcessObjectIDRefreshes += 1; + } else { + _pendingProcessObjectIDs = [processObjectIDs copy]; + _pendingProcessObjectIDRefreshes = 1; + } + return _pendingProcessObjectIDRefreshes >= XCWAudioProcessStableRefreshes; +} + +- (void)clearPendingProcessObjectIDs { + _pendingProcessObjectIDs = nil; + _pendingProcessObjectIDRefreshes = 0; +} + +- (BOOL)startGraphWithProcessObjectIDs:(NSArray *)processObjectIDs + error:(NSError * _Nullable __autoreleasing *)error API_AVAILABLE(macos(14.2)) { + CATapDescription *tapDescription = XCWAudioTapDescription(processObjectIDs); + OSStatus status = AudioHardwareCreateProcessTap(tapDescription, &_tapID); + if (status != noErr || _tapID == kAudioObjectUnknown) { + if (error != NULL) { + *error = XCWAudioCaptureStatusError(23, @"Create Core Audio process tap", status); + } + _tapID = kAudioObjectUnknown; + return NO; + } + + NSError *tapUIDError = nil; + NSString *tapUID = XCWAudioTapUID(_tapID, &tapUIDError); + if (tapUID.length == 0) { + if (error != NULL) { + *error = tapUIDError ?: XCWAudioCaptureError(24, @"Core Audio process tap did not expose a UID."); + } + [self stopGraph]; + return NO; + } + + NSString *aggregateUID = [NSString stringWithFormat:@"dev.simdeck.audio.%@", NSUUID.UUID.UUIDString]; + NSDictionary *aggregateDescription = @{ + XCWAudioDictionaryKey(kAudioAggregateDeviceNameKey): @"SimDeck Simulator Audio", + XCWAudioDictionaryKey(kAudioAggregateDeviceUIDKey): aggregateUID, + XCWAudioDictionaryKey(kAudioAggregateDeviceIsPrivateKey): @YES, + XCWAudioDictionaryKey(kAudioAggregateDeviceTapListKey): @[ + @{ XCWAudioDictionaryKey(kAudioSubTapUIDKey): tapUID } + ], + }; + status = AudioHardwareCreateAggregateDevice((__bridge CFDictionaryRef)aggregateDescription, &_aggregateDeviceID); + if (status != noErr || _aggregateDeviceID == kAudioObjectUnknown) { + if (error != NULL) { + *error = XCWAudioCaptureStatusError(25, @"Create Core Audio aggregate device", status); + } + [self stopGraph]; + return NO; + } + + CFArrayRef tapList = (__bridge CFArrayRef)@[ tapUID ]; + AudioObjectPropertyAddress tapListAddress = { + .mSelector = kAudioAggregateDevicePropertyTapList, + .mScope = kAudioObjectPropertyScopeGlobal, + .mElement = kAudioObjectPropertyElementMain, + }; + status = AudioObjectSetPropertyData(_aggregateDeviceID, + &tapListAddress, + 0, + NULL, + sizeof(tapList), + &tapList); + if (status != noErr) { + if (error != NULL) { + *error = XCWAudioCaptureStatusError(26, @"Attach Core Audio tap to aggregate device", status); + } + [self stopGraph]; + return NO; + } + + memset(&_streamDescription, 0, sizeof(_streamDescription)); + if (!XCWAudioGetObjectStreamFormat(_aggregateDeviceID, kAudioDevicePropertyStreamFormat, kAudioObjectPropertyScopeInput, &_streamDescription) && + !XCWAudioGetObjectStreamFormat(_tapID, kAudioTapPropertyFormat, kAudioObjectPropertyScopeGlobal, &_streamDescription)) { + if (error != NULL) { + *error = XCWAudioCaptureError(27, @"Core Audio tap did not expose a readable linear PCM format."); + } + [self stopGraph]; + return NO; + } + + status = AudioDeviceCreateIOProcID(_aggregateDeviceID, + XCWNativeAudioDeviceIOProc, + (__bridge void *)self, + &_ioProcID); + if (status != noErr || _ioProcID == NULL) { + if (error != NULL) { + *error = XCWAudioCaptureStatusError(28, @"Create Core Audio tap IOProc", status); + } + [self stopGraph]; + return NO; + } + + status = AudioDeviceStart(_aggregateDeviceID, _ioProcID); + if (status != noErr) { + if (error != NULL) { + *error = XCWAudioCaptureStatusError(29, @"Start Core Audio tap device", status); + } + [self stopGraph]; + return NO; + } + + _processObjectIDs = [processObjectIDs copy]; + [self clearPendingProcessObjectIDs]; + return YES; +} + +- (void)stopGraph { + if (_aggregateDeviceID != kAudioObjectUnknown && _ioProcID != NULL) { + AudioDeviceStop(_aggregateDeviceID, _ioProcID); + AudioDeviceDestroyIOProcID(_aggregateDeviceID, _ioProcID); + _ioProcID = NULL; + } + if (_aggregateDeviceID != kAudioObjectUnknown) { + AudioHardwareDestroyAggregateDevice(_aggregateDeviceID); + _aggregateDeviceID = kAudioObjectUnknown; + } + if (_tapID != kAudioObjectUnknown) { + AudioHardwareDestroyProcessTap(_tapID); + _tapID = kAudioObjectUnknown; + } + _processObjectIDs = @[]; + [self clearPendingProcessObjectIDs]; + memset(&_streamDescription, 0, sizeof(_streamDescription)); + [_opusEncoder invalidate]; +} + +- (void)invalidate { + _invalidated = YES; + [self stopGraph]; +} + +- (void)handleInputData:(const AudioBufferList *)inputData + inputTime:(const AudioTimeStamp *)inputTime { + if (_invalidated || _callback == NULL || inputData == NULL) { + return; + } + + AudioStreamBasicDescription streamDescription = _streamDescription; + uint32_t sampleRate = 0; + uint16_t channels = 0; + NSData *pcm = XCWPCM16InterleavedDataFromAudioBufferList(inputData, &streamDescription, &sampleRate, &channels); + if (pcm.length == 0 || sampleRate == 0 || channels == 0) { + return; + } + + NSError *encodeError = nil; + NSArray *packets = [_opusEncoder encodePCM:pcm + sampleRate:sampleRate + channels:channels + error:&encodeError]; + if (encodeError != nil) { + NSLog(@"SimDeck audio capture failed to encode Opus packet: %@", encodeError.localizedDescription); + return; + } + + uint64_t timestampUS = XCWAudioTimestampUS(inputTime); + for (NSData *packet in packets) { + if (packet.length == 0) { + continue; + } + xcw_native_audio_sample sample = { + .timestamp_us = timestampUS, + .sample_rate = XCWOpusSampleRate, + .channels = _opusEncoder.channels, + .data = XCWSharedBytesFromData(packet), + }; + _callback(&sample, _callbackUserData); + } +} + +@end + +static OSStatus XCWNativeAudioDeviceIOProc(AudioObjectID inDevice, + const AudioTimeStamp *inNow, + const AudioBufferList *inInputData, + const AudioTimeStamp *inInputTime, + AudioBufferList *outOutputData, + const AudioTimeStamp *inOutputTime, + void *inClientData) { + (void)inDevice; + (void)inNow; + (void)outOutputData; + (void)inOutputTime; + @autoreleasepool { + XCWNativeAudioCapture *capture = (__bridge XCWNativeAudioCapture *)inClientData; + [capture handleInputData:inInputData inputTime:inInputTime]; + } + return noErr; +} + static XCWNativeH264Encoder *XCWNativeH264EncoderFromHandle(void *handle) { return (__bridge XCWNativeH264Encoder *)handle; } @@ -1367,6 +2188,46 @@ void xcw_native_h264_encoder_request_keyframe(void *handle) { } } +void *xcw_native_audio_capture_create(const int32_t *process_ids, size_t process_count, xcw_native_audio_callback callback, void *user_data, char **error_message) { + @autoreleasepool { + XCWNativeAudioCapture *capture = [[XCWNativeAudioCapture alloc] initWithAudioCallback:callback + userData:user_data]; + NSError *error = nil; + BOOL ok = [capture startWithProcessIDs:process_ids count:process_count error:&error]; + if (!ok) { + XCWSetErrorMessage(error_message, error); + return NULL; + } + return (__bridge_retained void *)capture; + } +} + +bool xcw_native_audio_capture_update_processes(void *handle, const int32_t *process_ids, size_t process_count, char **error_message) { + if (handle == NULL) { + XCWSetErrorMessage(error_message, XCWAudioCaptureError(30, @"Audio capture handle is null.")); + return false; + } + @autoreleasepool { + XCWNativeAudioCapture *capture = (__bridge XCWNativeAudioCapture *)handle; + NSError *error = nil; + BOOL ok = [capture updateProcessIDs:process_ids count:process_count error:&error]; + if (!ok) { + XCWSetErrorMessage(error_message, error); + } + return ok; + } +} + +void xcw_native_audio_capture_destroy(void *handle) { + if (handle == NULL) { + return; + } + @autoreleasepool { + XCWNativeAudioCapture *capture = CFBridgingRelease(handle); + [capture invalidate]; + } +} + void xcw_native_free_string(char *value) { if (value != NULL) { free(value); diff --git a/packages/server/native_stubs.c b/packages/server/native_stubs.c index 7b3ae92a..266975b6 100644 --- a/packages/server/native_stubs.c +++ b/packages/server/native_stubs.c @@ -30,8 +30,17 @@ typedef struct { xcw_native_shared_bytes data; } xcw_native_frame; +typedef struct { + uint64_t timestamp_us; + uint32_t sample_rate; + uint16_t channels; + xcw_native_shared_bytes data; +} xcw_native_audio_sample; + typedef void (*xcw_native_frame_callback)(const xcw_native_frame *frame, void *user_data); +typedef void (*xcw_native_audio_callback)(const xcw_native_audio_sample *sample, + void *user_data); static char *xcw_strdup(const char *value) { if (value == NULL) { @@ -585,6 +594,34 @@ bool xcw_native_h264_encoder_encode_rgba(void *handle, const uint8_t *rgba, void xcw_native_h264_encoder_request_keyframe(void *handle) { (void)handle; } +void *xcw_native_audio_capture_create(const int32_t *process_ids, + uintptr_t process_count, + xcw_native_audio_callback callback, + void *user_data, + char **error_message) { + (void)process_ids; + (void)process_count; + (void)callback; + (void)user_data; + xcw_set_error(error_message, + "Audio capture is only available in the macOS native bridge."); + return NULL; +} + +bool xcw_native_audio_capture_update_processes(void *handle, + const int32_t *process_ids, + uintptr_t process_count, + char **error_message) { + (void)handle; + (void)process_ids; + (void)process_count; + xcw_set_error(error_message, + "Audio capture is only available in the macOS native bridge."); + return false; +} + +void xcw_native_audio_capture_destroy(void *handle) { (void)handle; } + void xcw_native_free_string(char *value) { free(value); } void xcw_native_free_bytes(xcw_native_owned_bytes bytes) { free(bytes.data); } diff --git a/packages/server/src/android.rs b/packages/server/src/android.rs index adc2c794..92100e1a 100644 --- a/packages/server/src/android.rs +++ b/packages/server/src/android.rs @@ -330,7 +330,6 @@ impl AndroidBridge { "-avd", &avd_name, window_mode, - "-no-audio", "-gpu", "swiftshader_indirect", ]; diff --git a/packages/server/src/native/ffi.rs b/packages/server/src/native/ffi.rs index 8a8a47b5..043df61c 100644 --- a/packages/server/src/native/ffi.rs +++ b/packages/server/src/native/ffi.rs @@ -26,10 +26,22 @@ pub struct xcw_native_frame { pub data: xcw_native_shared_bytes, } +#[repr(C)] +pub struct xcw_native_audio_sample { + pub timestamp_us: u64, + pub sample_rate: u32, + pub channels: u16, + pub data: xcw_native_shared_bytes, +} + #[allow(non_camel_case_types)] pub type xcw_native_frame_callback = unsafe extern "C" fn(frame: *const xcw_native_frame, user_data: *mut c_void); +#[allow(non_camel_case_types)] +pub type xcw_native_audio_callback = + unsafe extern "C" fn(sample: *const xcw_native_audio_sample, user_data: *mut c_void); + unsafe extern "C" { pub fn simdeck_camera_list_webcams_json(error_message: *mut *mut c_char) -> *mut c_char; pub fn simdeck_camera_start( @@ -351,6 +363,21 @@ unsafe extern "C" { ) -> bool; pub fn xcw_native_h264_encoder_request_keyframe(handle: *mut c_void); + pub fn xcw_native_audio_capture_create( + process_ids: *const i32, + process_count: usize, + callback: Option, + user_data: *mut c_void, + error_message: *mut *mut c_char, + ) -> *mut c_void; + pub fn xcw_native_audio_capture_update_processes( + handle: *mut c_void, + process_ids: *const i32, + process_count: usize, + error_message: *mut *mut c_char, + ) -> bool; + pub fn xcw_native_audio_capture_destroy(handle: *mut c_void); + pub fn xcw_native_free_string(value: *mut c_char); pub fn xcw_native_free_bytes(bytes: xcw_native_owned_bytes); pub fn xcw_native_release_shared_bytes(bytes: xcw_native_shared_bytes); diff --git a/packages/server/src/transport/webrtc.rs b/packages/server/src/transport/webrtc.rs index 2391ceda..bf774738 100644 --- a/packages/server/src/transport/webrtc.rs +++ b/packages/server/src/transport/webrtc.rs @@ -11,17 +11,18 @@ use crate::native::ffi; use crate::transport::packet::{FramePacket, SharedFrame}; use bytes::{BufMut, Bytes, BytesMut}; use serde::{Deserialize, Serialize}; -use std::collections::{HashMap, VecDeque}; +use std::collections::{BTreeSet, HashMap, VecDeque}; use std::ffi::{c_void, CStr}; +use std::process::Command; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, Mutex, OnceLock, RwLock, Weak}; use std::time::Duration; -use tokio::sync::{broadcast, mpsc}; +use tokio::sync::{broadcast, mpsc, watch}; use tokio::task; use tokio::time::{self, Instant}; use tracing::{info, warn}; use webrtc::api::interceptor_registry::register_default_interceptors; -use webrtc::api::media_engine::{MediaEngine, MIME_TYPE_H264}; +use webrtc::api::media_engine::{MediaEngine, MIME_TYPE_H264, MIME_TYPE_OPUS}; use webrtc::api::APIBuilder; use webrtc::data_channel::data_channel_init::RTCDataChannelInit; use webrtc::data_channel::data_channel_message::DataChannelMessage; @@ -29,6 +30,7 @@ use webrtc::data_channel::data_channel_state::RTCDataChannelState; use webrtc::data_channel::RTCDataChannel; use webrtc::ice_transport::ice_server::RTCIceServer; use webrtc::interceptor::registry::Registry; +use webrtc::media::Sample as WebRtcSample; use webrtc::peer_connection::configuration::RTCConfiguration; use webrtc::peer_connection::peer_connection_state::RTCPeerConnectionState; use webrtc::peer_connection::policy::ice_transport_policy::RTCIceTransportPolicy; @@ -44,6 +46,7 @@ use webrtc::rtp_transceiver::rtp_codec::{ }; use webrtc::rtp_transceiver::RTCPFeedback; use webrtc::track::track_local::track_local_static_rtp::TrackLocalStaticRTP; +use webrtc::track::track_local::track_local_static_sample::TrackLocalStaticSample; use webrtc::track::track_local::TrackLocal; use webrtc::track::track_local::TrackLocalWriter; @@ -57,6 +60,7 @@ const WEBRTC_MAX_LOCAL_STREAM_FPS: u32 = 240; const WEBRTC_WRITE_TIMEOUT: Duration = Duration::from_millis(120); const WEBRTC_REALTIME_WRITE_TIMEOUT: Duration = Duration::from_millis(45); const WEBRTC_REALTIME_KEYFRAME_WRITE_TIMEOUT: Duration = Duration::from_millis(90); +const WEBRTC_AUDIO_WRITE_TIMEOUT: Duration = Duration::from_millis(120); const WEBRTC_INITIAL_KEYFRAME_TIMEOUT: Duration = Duration::from_secs(5); const WEBRTC_FAST_ICE_GATHER_TIMEOUT: Duration = Duration::from_millis(250); const WEBRTC_FULL_ICE_GATHER_TIMEOUT: Duration = Duration::from_secs(3); @@ -72,6 +76,14 @@ const ANDROID_WEBRTC_RGBA_VERSION: u8 = 1; const ANDROID_WEBRTC_RGBA_FORMAT_RGBA8888: u8 = 1; const ANDROID_WEBRTC_RGBA_BUFFERED_FRAME_LIMIT: usize = 2; const ANDROID_WEBRTC_FPS: u64 = 30; +const WEBRTC_AUDIO_PROCESS_REFRESH_INTERVAL: Duration = Duration::from_secs(1); +const WEBRTC_AUDIO_SAMPLE_RATE: u32 = 48_000; +const WEBRTC_AUDIO_CHANNELS: u16 = 2; +const WEBRTC_AUDIO_FRAME_DURATION: Duration = Duration::from_millis(20); +const WEBRTC_AUDIO_SILENCE_TIMEOUT: Duration = Duration::from_millis(18); +const WEBRTC_OPUS_SILENCE_PACKET: &[u8] = &[ + 0x28, 0x0B, 0xE4, 0x89, 0x1A, 0x2C, 0x08, 0x8A, 0xAE, 0xF8, 0x3A, 0xEC, +]; static WEBRTC_MEDIA_STREAMS: OnceLock>>> = OnceLock::new(); const MAX_WEBRTC_MEDIA_STREAMS_PER_UDID: usize = 16; @@ -100,9 +112,19 @@ pub struct WebRtcAnswerPayload { pub sdp: String, #[serde(rename = "type")] pub kind: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub audio: Option, pub video: WebRtcVideoMetadata, } +#[derive(Debug, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct WebRtcAudioMetadata { + pub channels: u16, + pub codec: String, + pub sample_rate: u32, +} + #[derive(Debug, Serialize)] #[serde(rename_all = "camelCase")] pub struct WebRtcVideoMetadata { @@ -206,6 +228,7 @@ pub async fn create_answer( "WebRTC preview requires H.264. Restart SimDeck with `--video-codec auto`, `hardware`, or `software`.", )); } + let wants_audio = sdp_has_media_type(&payload.sdp, "audio"); let h264_fmtp_line = h264_sdp_fmtp_line(&codec, &payload.sdp); let mut media_engine = MediaEngine::default(); @@ -225,6 +248,9 @@ pub async fn create_answer( RTPCodecType::Video, ) .map_err(|error| AppError::internal(format!("register WebRTC H.264 codec: {error}")))?; + if wants_audio { + register_opus_audio_codec(&mut media_engine)?; + } let mut registry = Registry::new(); registry = register_default_interceptors(registry, &mut media_engine) .map_err(|error| AppError::internal(format!("register WebRTC interceptors: {error}")))?; @@ -261,6 +287,21 @@ pub async fn create_answer( ), } + let audio_track = if wants_audio { + let track = Arc::new(TrackLocalStaticSample::new( + opus_audio_codec_capability(), + "simdeck-audio".to_owned(), + "simdeck".to_owned(), + )); + let audio_sender = peer_connection + .add_track(track.clone() as Arc) + .await + .map_err(|error| AppError::internal(format!("add WebRTC audio track: {error}")))?; + tokio::spawn(async move { while audio_sender.read_rtcp().await.is_ok() {} }); + Some(track) + } else { + None + }; let video_track = Arc::new(TrackLocalStaticRTP::new( RTCRtpCodecCapability { mime_type: MIME_TYPE_H264.to_owned(), @@ -346,6 +387,7 @@ pub async fn create_answer( first_frame, peer_connection, video_track, + audio_track, cancellation_token, cancellation, stream_control_rx, @@ -356,6 +398,11 @@ pub async fn create_answer( Ok(WebRtcAnswerPayload { sdp: local_description.sdp, kind: "answer".to_owned(), + audio: wants_audio.then(|| WebRtcAudioMetadata { + channels: WEBRTC_AUDIO_CHANNELS, + codec: "opus".to_owned(), + sample_rate: WEBRTC_AUDIO_SAMPLE_RATE, + }), video: WebRtcVideoMetadata { width: first_frame_width, height: first_frame_height, @@ -368,6 +415,7 @@ async fn create_android_rgba_answer( udid: String, payload: WebRtcOfferPayload, ) -> Result { + let wants_audio = sdp_has_media_type(&payload.sdp, "audio"); let source = AndroidWebRtcSource::start( state.android.clone(), state.metrics.clone(), @@ -387,7 +435,22 @@ async fn create_android_rgba_answer( ice_transport_policy_label() ); - let api = APIBuilder::new().build(); + let api = if wants_audio { + let mut media_engine = MediaEngine::default(); + register_opus_audio_codec(&mut media_engine)?; + let mut registry = Registry::new(); + registry = register_default_interceptors(registry, &mut media_engine).map_err(|error| { + AppError::internal(format!( + "register Android RGBA WebRTC interceptors: {error}" + )) + })?; + APIBuilder::new() + .with_media_engine(media_engine) + .with_interceptor_registry(registry) + .build() + } else { + APIBuilder::new().build() + }; let peer_connection = Arc::new( api.new_peer_connection(RTCConfiguration { ice_servers: ice_servers(), @@ -417,6 +480,23 @@ async fn create_android_rgba_answer( ) .await .map_err(|error| AppError::internal(format!("create RGBA WebRTC data channel: {error}")))?; + let audio_track = if wants_audio { + let track = Arc::new(TrackLocalStaticSample::new( + opus_audio_codec_capability(), + "simdeck-audio".to_owned(), + "simdeck".to_owned(), + )); + let audio_sender = peer_connection + .add_track(track.clone() as Arc) + .await + .map_err(|error| { + AppError::internal(format!("add Android RGBA WebRTC audio track: {error}")) + })?; + tokio::spawn(async move { while audio_sender.read_rtcp().await.is_ok() {} }); + Some(track) + } else { + None + }; let fast_gather = has_sdp_candidate_type(&payload.sdp, "host") && ice_transport_policy_label() == "all"; @@ -467,6 +547,7 @@ async fn create_android_rgba_answer( source, peer_connection, rgba_channel, + audio_track, cancellation_token, cancellation, stream_control_rx, @@ -477,6 +558,11 @@ async fn create_android_rgba_answer( Ok(WebRtcAnswerPayload { sdp: local_description.sdp, kind: "answer".to_owned(), + audio: wants_audio.then(|| WebRtcAudioMetadata { + channels: WEBRTC_AUDIO_CHANNELS, + codec: "opus".to_owned(), + sample_rate: WEBRTC_AUDIO_SAMPLE_RATE, + }), video: WebRtcVideoMetadata { width: 0, height: 0, @@ -582,6 +668,13 @@ fn summarize_sdp_candidate_types(sdp: &str) -> String { format!("host={host},srflx={srflx},prflx={prflx},relay={relay},other={other}") } +fn sdp_has_media_type(sdp: &str, media_type: &str) -> bool { + let prefix = format!("m={media_type} "); + sdp.lines() + .map(str::trim_start) + .any(|line| line.starts_with(&prefix)) +} + fn redact_candidate_address(address: &str) -> String { if address.is_empty() { return String::new(); @@ -1259,6 +1352,29 @@ fn h264_rtcp_feedback() -> Vec { ] } +fn opus_audio_codec_capability() -> RTCRtpCodecCapability { + RTCRtpCodecCapability { + mime_type: MIME_TYPE_OPUS.to_owned(), + clock_rate: WEBRTC_AUDIO_SAMPLE_RATE, + channels: WEBRTC_AUDIO_CHANNELS, + sdp_fmtp_line: "minptime=10;useinbandfec=1;stereo=1;sprop-stereo=1".to_owned(), + rtcp_feedback: Vec::new(), + } +} + +fn register_opus_audio_codec(media_engine: &mut MediaEngine) -> Result<(), AppError> { + media_engine + .register_codec( + RTCRtpCodecParameters { + capability: opus_audio_codec_capability(), + payload_type: 111, + ..Default::default() + }, + RTPCodecType::Audio, + ) + .map_err(|error| AppError::internal(format!("register WebRTC Opus codec: {error}"))) +} + fn rtcp_packet_requests_keyframe(packet: &(dyn RtcpPacket + Send + Sync)) -> bool { packet.as_any().is::() || packet.as_any().is::() } @@ -1425,6 +1541,442 @@ fn ice_transport_policy() -> RTCIceTransportPolicy { } } +#[derive(Clone)] +struct SimulatorAudioCapture { + inner: Arc, +} + +struct SimulatorAudioCaptureInner { + handle: AtomicUsize, + callback_user_data: AtomicUsize, + sender: watch::Sender>, +} + +#[derive(Debug)] +struct EncodedAudioSample { + sample_rate: u32, + channels: u16, + data: Bytes, +} + +type SharedEncodedAudioSample = Arc; + +impl SimulatorAudioCapture { + fn start( + process_ids: &[i32], + sender: watch::Sender>, + ) -> Result { + if process_ids.is_empty() { + return Err(AppError::native( + "No simulator audio process IDs were available.", + )); + } + let inner = Arc::new(SimulatorAudioCaptureInner { + handle: AtomicUsize::new(0), + callback_user_data: AtomicUsize::new(0), + sender, + }); + let user_data = Weak::into_raw(Arc::downgrade(&inner)) as *mut c_void; + let mut error = std::ptr::null_mut(); + let handle = unsafe { + ffi::xcw_native_audio_capture_create( + process_ids.as_ptr(), + process_ids.len(), + Some(host_audio_capture_callback), + user_data, + &mut error, + ) + }; + if handle.is_null() { + unsafe { + let _ = Weak::from_raw(user_data as *const SimulatorAudioCaptureInner); + } + return Err(unsafe { take_native_error(error) } + .unwrap_or_else(|| AppError::native("Unable to start simulator audio capture."))); + } + inner.handle.store(handle as usize, Ordering::Release); + inner + .callback_user_data + .store(user_data as usize, Ordering::Release); + Ok(Self { inner }) + } + + fn update_processes(&self, process_ids: &[i32]) -> Result<(), AppError> { + if process_ids.is_empty() { + return Ok(()); + } + let handle = self.inner.handle.load(Ordering::Acquire); + if handle == 0 { + return Err(AppError::native( + "Simulator audio capture handle was already closed.", + )); + } + let mut error = std::ptr::null_mut(); + let ok = unsafe { + ffi::xcw_native_audio_capture_update_processes( + handle as *mut c_void, + process_ids.as_ptr(), + process_ids.len(), + &mut error, + ) + }; + if ok { + Ok(()) + } else { + Err(unsafe { take_native_error(error) }.unwrap_or_else(|| { + AppError::native("Unable to update simulator audio capture processes.") + })) + } + } +} + +impl Drop for SimulatorAudioCaptureInner { + fn drop(&mut self) { + let handle = self.handle.load(Ordering::Acquire); + let callback_user_data = self.callback_user_data.load(Ordering::Acquire); + unsafe { + if handle != 0 { + ffi::xcw_native_audio_capture_destroy(handle as *mut c_void); + } + if callback_user_data != 0 { + let _ = Weak::from_raw(callback_user_data as *const SimulatorAudioCaptureInner); + } + } + } +} + +unsafe extern "C" fn host_audio_capture_callback( + sample: *const ffi::xcw_native_audio_sample, + user_data: *mut c_void, +) { + if sample.is_null() || user_data.is_null() { + return; + } + + let weak = unsafe { Weak::from_raw(user_data as *const SimulatorAudioCaptureInner) }; + if let Some(inner) = weak.upgrade() { + unsafe { + inner.handle_audio_sample(&*sample); + } + } + let _ = Weak::into_raw(weak); +} + +impl SimulatorAudioCaptureInner { + unsafe fn handle_audio_sample(&self, sample: &ffi::xcw_native_audio_sample) { + if sample.sample_rate == 0 || sample.channels == 0 { + unsafe { + ffi::xcw_native_release_shared_bytes(sample.data); + } + return; + } + let Some(data) = (unsafe { copy_native_shared_bytes(sample.data) }) else { + return; + }; + if data.is_empty() { + return; + } + let packet = Arc::new(EncodedAudioSample { + sample_rate: sample.sample_rate, + channels: sample.channels, + data, + }); + self.sender.send_replace(Some(packet)); + } +} + +fn spawn_simulator_audio_stream( + state: AppState, + udid: String, + audio_track: Arc, + mut cancellation: broadcast::Receiver<()>, +) { + tokio::spawn(async move { + let (sample_tx, mut sample_rx) = watch::channel(None); + let (audio_stop_tx, _) = broadcast::channel(1); + let mut capture_cancellation = cancellation.resubscribe(); + let mut capture_stop = audio_stop_tx.subscribe(); + let capture_state = state.clone(); + let capture_udid = udid.clone(); + tokio::spawn(async move { + let mut capture: Option = None; + let mut refresh = time::interval(WEBRTC_AUDIO_PROCESS_REFRESH_INTERVAL); + loop { + tokio::select! { + _ = capture_cancellation.recv() => break, + _ = capture_stop.recv() => break, + _ = refresh.tick() => { + let process_ids = match resolve_simulator_audio_process_ids(capture_state.clone(), &capture_udid).await { + Ok(process_ids) => process_ids, + Err(error) => { + warn!("WebRTC audio process discovery failed for {capture_udid}: {error}"); + continue; + } + }; + if process_ids.is_empty() { + capture = None; + continue; + } + if let Some(active_capture) = capture.as_ref().cloned() { + let update_process_ids = process_ids.clone(); + let update_result = task::spawn_blocking(move || { + active_capture.update_processes(&update_process_ids) + }).await; + let update_result = match update_result { + Ok(result) => result, + Err(error) => Err(AppError::internal(format!( + "Failed to join audio capture update task: {error}" + ))), + }; + if let Err(error) = update_result { + warn!("WebRTC audio capture update failed for {capture_udid}: {error}"); + capture = None; + } + continue; + } + let tx = sample_tx.clone(); + match task::spawn_blocking(move || SimulatorAudioCapture::start(&process_ids, tx)).await { + Ok(Ok(new_capture)) => { + capture = Some(new_capture); + } + Ok(Err(error)) => { + warn!("WebRTC audio capture unavailable for {capture_udid}: {error}"); + } + Err(error) => { + warn!("WebRTC audio capture task failed for {capture_udid}: {error}"); + } + } + } + } + } + }); + let mut silence = time::interval(WEBRTC_AUDIO_FRAME_DURATION); + silence.set_missed_tick_behavior(time::MissedTickBehavior::Delay); + let mut last_audio_write_at: Option = None; + loop { + tokio::select! { + _ = cancellation.recv() => break, + _ = silence.tick() => { + if last_audio_write_at.is_some_and(|instant| instant.elapsed() < WEBRTC_AUDIO_SILENCE_TIMEOUT) { + continue; + } + match write_webrtc_audio_sample(&audio_track, Bytes::from_static(WEBRTC_OPUS_SILENCE_PACKET)).await { + Ok(true) => { + last_audio_write_at = Some(Instant::now()); + } + Ok(false) => {} + Err(error) => { + warn!("WebRTC audio write failed for {udid}: {error}"); + let _ = audio_stop_tx.send(()); + return; + } + } + } + sample = sample_rx.changed() => { + if sample.is_err() { + break; + } + let Some(sample) = sample_rx.borrow_and_update().clone() else { + continue; + }; + if sample.sample_rate != WEBRTC_AUDIO_SAMPLE_RATE || sample.channels != WEBRTC_AUDIO_CHANNELS { + warn!( + "Ignoring unexpected WebRTC Opus audio packet format for {udid}: {} Hz, {} channels", + sample.sample_rate, + sample.channels + ); + continue; + } + match write_webrtc_audio_sample(&audio_track, sample.data.clone()).await { + Ok(true) => { + last_audio_write_at = Some(Instant::now()); + } + Ok(false) => {} + Err(error) => { + warn!("WebRTC audio write failed for {udid}: {error}"); + let _ = audio_stop_tx.send(()); + return; + } + } + } + } + } + let _ = audio_stop_tx.send(()); + }); +} + +async fn write_webrtc_audio_sample( + audio_track: &TrackLocalStaticSample, + data: Bytes, +) -> Result { + let sample = WebRtcSample { + data, + duration: WEBRTC_AUDIO_FRAME_DURATION, + ..Default::default() + }; + match time::timeout( + WEBRTC_AUDIO_WRITE_TIMEOUT, + audio_track.write_sample(&sample), + ) + .await + { + Ok(Ok(())) => Ok(true), + Ok(Err(error)) => Err(error.to_string()), + Err(_) => Ok(false), + } +} + +#[derive(Clone, Debug, Eq, PartialEq)] +struct HostAudioProcess { + pid: i32, + parent_pid: i32, + command: String, +} + +async fn resolve_simulator_audio_process_ids( + state: AppState, + udid: &str, +) -> Result, AppError> { + let udid = udid.to_owned(); + task::spawn_blocking(move || simulator_audio_process_ids_blocking(&state, &udid)) + .await + .map_err(|error| { + AppError::internal(format!( + "Failed to join audio process discovery task: {error}" + )) + })? +} + +fn simulator_audio_process_ids_blocking( + state: &AppState, + udid: &str, +) -> Result, AppError> { + let processes = list_host_audio_processes()?; + let root_processes = if android::is_android_id(udid) { + android_audio_root_process_ids(udid, &processes)? + } else { + let bridge = state.registry.bridge().clone(); + let simulator = bridge + .simulator(udid)? + .ok_or_else(|| AppError::not_found(format!("Unknown simulator `{udid}`.")))?; + let data_path = simulator + .data_path + .as_str() + .filter(|value| !value.trim().is_empty()) + .map(ToOwned::to_owned); + ios_simulator_audio_root_process_ids(udid, data_path.as_deref(), &processes) + }; + Ok(process_tree_process_ids(&processes, root_processes)) +} + +fn list_host_audio_processes() -> Result, AppError> { + let output = Command::new("ps") + .args(["-axo", "pid=,ppid=,command="]) + .output() + .map_err(|error| AppError::native(format!("Unable to list host processes: {error}")))?; + if !output.status.success() { + return Err(AppError::native("Unable to list host processes.")); + } + Ok(String::from_utf8_lossy(&output.stdout) + .lines() + .filter_map(parse_host_audio_process_line) + .collect()) +} + +fn parse_host_audio_process_line(line: &str) -> Option { + let trimmed = line.trim(); + if trimmed.is_empty() { + return None; + } + let mut parts = trimmed.split_whitespace(); + let pid = parts.next()?.parse::().ok()?; + let parent_pid = parts.next()?.parse::().ok()?; + let command = parts.collect::>().join(" "); + if command.is_empty() { + return None; + } + Some(HostAudioProcess { + pid, + parent_pid, + command, + }) +} + +fn ios_simulator_audio_root_process_ids( + udid: &str, + data_path: Option<&str>, + processes: &[HostAudioProcess], +) -> BTreeSet { + let device_path = data_path + .and_then(|path| path.strip_suffix("/data")) + .filter(|path| !path.is_empty()); + processes + .iter() + .filter(|process| { + !is_simulator_audio_probe_process(&process.command) + && (process.command.contains(udid) + || data_path.is_some_and(|path| process.command.contains(path)) + || device_path.is_some_and(|path| process.command.contains(path))) + }) + .map(|process| process.pid) + .collect() +} + +fn android_audio_root_process_ids( + udid: &str, + processes: &[HostAudioProcess], +) -> Result, AppError> { + let avd_name = android::avd_from_id(udid)?; + let avd_arg = format!("-avd {avd_name}"); + let avd_at_arg = format!("@{avd_name}"); + let avd_dir = format!(".android/avd/{avd_name}.avd"); + Ok(processes + .iter() + .filter(|process| { + let command = process.command.as_str(); + !is_simulator_audio_probe_process(command) + && (command.contains(&avd_arg) + || command.contains(&avd_at_arg) + || command.contains(&avd_dir)) + }) + .map(|process| process.pid) + .collect()) +} + +fn process_tree_process_ids(processes: &[HostAudioProcess], roots: BTreeSet) -> Vec { + let mut by_parent: HashMap> = HashMap::new(); + for process in processes { + by_parent + .entry(process.parent_pid) + .or_default() + .push(process.pid); + } + + let mut selected = roots; + let mut stack = selected.iter().copied().collect::>(); + while let Some(parent_pid) = stack.pop() { + if let Some(children) = by_parent.get(&parent_pid) { + for child_pid in children { + if selected.insert(*child_pid) { + stack.push(*child_pid); + } + } + } + } + selected.into_iter().collect() +} + +fn is_simulator_audio_probe_process(command: &str) -> bool { + let executable = command + .split_whitespace() + .next() + .and_then(|value| value.rsplit('/').next()) + .unwrap_or_default(); + executable == "simctl" + || executable == "xcrun" && command.contains(" simctl ") + || executable == "ps" +} + #[derive(Clone)] pub(crate) struct AndroidWebRtcSource { inner: Arc, @@ -1904,6 +2456,7 @@ struct WebRtcMediaStream { first_frame: SharedFrame, peer_connection: Arc, video_track: Arc, + audio_track: Option>, cancellation_token: broadcast::Sender<()>, cancellation: broadcast::Receiver<()>, stream_control_rx: mpsc::UnboundedReceiver, @@ -1915,6 +2468,7 @@ struct WebRtcRgbaStream { udid: String, peer_connection: Arc, rgba_channel: Arc, + audio_track: Option>, cancellation_token: broadcast::Sender<()>, cancellation: broadcast::Receiver<()>, stream_control_rx: mpsc::UnboundedReceiver, @@ -1928,6 +2482,7 @@ impl WebRtcRgbaStream { udid, peer_connection, rgba_channel, + audio_track, cancellation_token, mut cancellation, mut stream_control_rx, @@ -1937,6 +2492,14 @@ impl WebRtcRgbaStream { let mut peer_disconnected_since: Option = None; let mut sequence = 0u64; let _guard = WebRtcMetricsGuard::new(state.metrics.clone()); + if let Some(audio_track) = audio_track { + spawn_simulator_audio_stream( + state.clone(), + udid.clone(), + audio_track, + cancellation_token.subscribe(), + ); + } rgba_channel.on_open(Box::new({ let udid = udid.clone(); move || { @@ -2048,6 +2611,7 @@ impl WebRtcMediaStream { first_frame, peer_connection, video_track, + audio_track, cancellation_token, mut cancellation, mut stream_control_rx, @@ -2067,6 +2631,14 @@ impl WebRtcMediaStream { let mut waiting_for_keyframe = false; let mut peer_disconnected_since: Option = None; let _guard = WebRtcMetricsGuard::new(state.metrics.clone()); + if let Some(audio_track) = audio_track { + spawn_simulator_audio_stream( + state.clone(), + udid.clone(), + audio_track, + cancellation_token.subscribe(), + ); + } let first_frame_duration = send_timing.duration_for(&first_frame, realtime_stream); match write_frame_sample_with_timeout( @@ -2645,10 +3217,12 @@ mod tests { use super::{ android_rgba_webrtc_frame_chunks, append_avcc_parameter_sets, append_length_prefixed_nalus, h264_annex_b_sample, h264_frame_has_idr, h264_frame_is_decoder_sync, h264_sdp_fmtp_line, - is_annex_b, is_h264_codec, rtcp_packet_requests_keyframe, rtp_packet_pacing, - WebRtcMetricsGuard, WebRtcSendTiming, ANDROID_WEBRTC_RGBA_CHUNK_BYTES, - ANDROID_WEBRTC_RGBA_CHUNK_HEADER_BYTES, ANDROID_WEBRTC_RGBA_CHUNK_MAGIC, - ANDROID_WEBRTC_RGBA_FORMAT_RGBA8888, ANDROID_WEBRTC_RGBA_VERSION, ANNEX_B_START_CODE, + is_annex_b, is_h264_codec, opus_audio_codec_capability, rtcp_packet_requests_keyframe, + rtp_packet_pacing, sdp_has_media_type, WebRtcMetricsGuard, WebRtcSendTiming, + ANDROID_WEBRTC_RGBA_CHUNK_BYTES, ANDROID_WEBRTC_RGBA_CHUNK_HEADER_BYTES, + ANDROID_WEBRTC_RGBA_CHUNK_MAGIC, ANDROID_WEBRTC_RGBA_FORMAT_RGBA8888, + ANDROID_WEBRTC_RGBA_VERSION, ANNEX_B_START_CODE, WEBRTC_AUDIO_CHANNELS, + WEBRTC_AUDIO_SAMPLE_RATE, WEBRTC_OPUS_SILENCE_PACKET, }; use crate::android; use crate::metrics::counters::Metrics; @@ -2705,6 +3279,110 @@ mod tests { assert!(!rtcp_packet_requests_keyframe(&SenderReport::default())); } + #[test] + fn detects_audio_m_lines_in_browser_offers() { + assert!(sdp_has_media_type( + "v=0\r\nm=audio 9 UDP/TLS/RTP/SAVPF 0\r\nm=video 9 UDP/TLS/RTP/SAVPF 96\r\n", + "audio" + )); + assert!(!sdp_has_media_type( + "v=0\r\nm=video 9 UDP/TLS/RTP/SAVPF 96\r\n", + "audio" + )); + } + + #[test] + fn opus_audio_codec_uses_browser_native_wideband_settings() { + let codec = opus_audio_codec_capability(); + + assert_eq!(codec.mime_type, "audio/opus"); + assert_eq!(codec.clock_rate, WEBRTC_AUDIO_SAMPLE_RATE); + assert_eq!(codec.channels, WEBRTC_AUDIO_CHANNELS); + assert!(codec.sdp_fmtp_line.contains("stereo=1")); + assert!(codec.sdp_fmtp_line.contains("useinbandfec=1")); + } + + #[test] + fn opus_silence_packet_uses_real_low_bitrate_audio_frame() { + assert_eq!(WEBRTC_OPUS_SILENCE_PACKET.len(), 12); + assert_ne!(WEBRTC_OPUS_SILENCE_PACKET, &[0xF8, 0xFF, 0xFE]); + } + + #[test] + fn parses_host_audio_process_lines_with_commands_containing_spaces() { + assert_eq!( + super::parse_host_audio_process_line(" 42 1 /tmp/My App.app/My App --flag value"), + Some(super::HostAudioProcess { + pid: 42, + parent_pid: 1, + command: "/tmp/My App.app/My App --flag value".to_owned(), + }) + ); + } + + #[test] + fn ios_audio_process_discovery_includes_device_descendants() { + let processes = vec![ + super::HostAudioProcess { + pid: 10, + parent_pid: 1, + command: "/Library/Developer/CoreSimulator/Profiles/Runtimes/iOS.simruntime/Contents/Resources/RuntimeRoot/usr/libexec/launchd_sim /Users/me/Library/Developer/CoreSimulator/Devices/UDID-1/data" + .to_owned(), + }, + super::HostAudioProcess { + pid: 11, + parent_pid: 10, + command: "/Applications/Fixture.app/Fixture".to_owned(), + }, + super::HostAudioProcess { + pid: 12, + parent_pid: 1, + command: "/usr/bin/xcrun simctl spawn UDID-1 launchctl print user/501" + .to_owned(), + }, + ]; + let roots = super::ios_simulator_audio_root_process_ids( + "UDID-1", + Some("/Users/me/Library/Developer/CoreSimulator/Devices/UDID-1/data"), + &processes, + ); + + assert_eq!( + super::process_tree_process_ids(&processes, roots), + vec![10, 11] + ); + } + + #[test] + fn android_audio_process_discovery_includes_emulator_descendants() { + let processes = vec![ + super::HostAudioProcess { + pid: 20, + parent_pid: 1, + command: + "/Users/me/Library/Android/sdk/emulator/emulator -avd Pixel_8_API_36 -no-window" + .to_owned(), + }, + super::HostAudioProcess { + pid: 21, + parent_pid: 20, + command: "qemu-system-aarch64 -some-child-arg".to_owned(), + }, + super::HostAudioProcess { + pid: 22, + parent_pid: 1, + command: "/Users/me/Library/Android/sdk/emulator/emulator -avd Other".to_owned(), + }, + ]; + let roots = + super::android_audio_root_process_ids("android:Pixel_8_API_36", &processes).unwrap(); + + assert_eq!( + super::process_tree_process_ids(&processes, roots), + vec![20, 21] + ); + } + #[test] fn realtime_h264_advertises_retransmission_feedback() { let feedback = super::h264_rtcp_feedback(); diff --git a/scripts/e2e-webrtc-reliability.mjs b/scripts/e2e-webrtc-reliability.mjs index 6d161d1c..de3f500b 100644 --- a/scripts/e2e-webrtc-reliability.mjs +++ b/scripts/e2e-webrtc-reliability.mjs @@ -28,6 +28,9 @@ const maxInteractionLatencyMs = Number( process.env.SIMDECK_E2E_MAX_INTERACTION_LATENCY_MS ?? 750, ); const interactionsEnabled = process.env.SIMDECK_E2E_INTERACTIONS !== "0"; +const audioEnabled = process.env.SIMDECK_E2E_ENABLE_AUDIO === "1"; +const capturePeerSnapshot = + process.env.SIMDECK_E2E_CAPTURE_PEER_SNAPSHOT === "1"; const maxPeerDisconnectedMs = Number( process.env.SIMDECK_E2E_MAX_PEER_DISCONNECTED_MS ?? 1000, ); @@ -41,6 +44,7 @@ const minVideoHeight = Number(process.env.SIMDECK_E2E_MIN_VIDEO_HEIGHT ?? 0); const minDecodedFps = Number(process.env.SIMDECK_E2E_MIN_DECODED_FPS ?? 0); const minPresentedFps = Number(process.env.SIMDECK_E2E_MIN_PRESENTED_FPS ?? 0); const minReceivedFps = Number(process.env.SIMDECK_E2E_MIN_RECEIVED_FPS ?? 0); +const minAudioPackets = Number(process.env.SIMDECK_E2E_MIN_AUDIO_PACKETS ?? 0); const visualSampleIntervalMs = Number( process.env.SIMDECK_E2E_VISUAL_SAMPLE_INTERVAL_MS ?? 5000, ); @@ -559,12 +563,20 @@ try { if (warmupMs > 0) { await sleep(warmupMs); } + let streamAudioEnabled = false; + if (audioEnabled) { + streamAudioEnabled = await enableStreamAudio(cdp); + await sleep(500); + } const initialMetrics = await fetchJson(endpoint("/api/metrics")); const initialStreams = findClientStreams(initialMetrics, clientId); const initialPage = latestByKind(initialStreams, "page") ?? {}; const initialWebRtc = latestByKind(initialStreams, "webrtc") ?? {}; const directStatsStart = await collectDirectWebRtcStats(cdp); + const peerConnectionStart = capturePeerSnapshot + ? await collectPeerConnectionSnapshot(cdp) + : []; let maxObservedFrameGapMs = 0; let maxObservedDecodeQueue = 0; @@ -652,6 +664,9 @@ try { const finalPage = latestByKind(finalStreams, "page") ?? {}; const finalWebRtc = latestByKind(finalStreams, "webrtc") ?? {}; const directStatsEnd = await collectDirectWebRtcStats(cdp); + const peerConnectionEnd = capturePeerSnapshot + ? await collectPeerConnectionSnapshot(cdp) + : []; const failures = []; const renderedDelta = @@ -673,6 +688,8 @@ try { directStatsEnd.packetsReceived - directStatsStart.packetsReceived; const directPresentedDelta = directStatsEnd.totalVideoFrames - directStatsStart.totalVideoFrames; + const directAudioPacketsDelta = + directStatsEnd.audioPacketsReceived - directStatsStart.audioPacketsReceived; const observedDurationSeconds = Math.max( 0.001, (directStatsEnd.timestampMs - directStatsStart.timestampMs) / 1000, @@ -738,6 +755,11 @@ try { `received packet fps ${receivedFps.toFixed(2)} did not meet minimum ${minReceivedFps}`, ); } + if (minAudioPackets > 0 && directAudioPacketsDelta < minAudioPackets) { + failures.push( + `audio RTP packets ${directAudioPacketsDelta} did not meet minimum ${minAudioPackets}`, + ); + } if (maxPeerDisconnectedObservedMs > maxPeerDisconnectedMs) { failures.push( `peer disconnected for ${maxPeerDisconnectedObservedMs}ms, exceeded ${maxPeerDisconnectedMs}ms`, @@ -794,6 +816,7 @@ try { initialPage, initialWebRtc, observedDurationSeconds, + ...(capturePeerSnapshot ? { peerConnectionEnd, peerConnectionStart } : {}), decodedFps, presentedFps, receivedFps, @@ -803,8 +826,11 @@ try { maxPeerDisconnectedObservedMs, maxInteractionLatencyMs, maxDecoderDrops, + minAudioPackets, warmupMs, interactionsEnabled, + audioEnabled, + streamAudioEnabled, visualSamplesEnabled, interactionLatencies, presentedInteractionLatencies, @@ -823,6 +849,7 @@ try { renderedDelta, decodedDelta, receivedDelta, + directAudioPacketsDelta, droppedDelta, reconnectDelta, streams: finalStreams.map((stream) => ({ @@ -853,6 +880,27 @@ try { await rm(profileDir, { force: true, recursive: true }); } +async function enableStreamAudio(cdp) { + return evaluate( + cdp, + ` + (async () => { + const menuButton = [...document.querySelectorAll("button")] + .find((button) => button.title === "Open menu" || button.getAttribute("aria-label") === "Open menu" || button.textContent?.trim() === "Open menu"); + menuButton?.click(); + await new Promise((resolve) => requestAnimationFrame(() => resolve())); + const soundLabel = [...document.querySelectorAll("label")] + .find((label) => label.textContent?.trim() === "Sound"); + const input = soundLabel?.querySelector("input[type='checkbox']"); + if (input && !input.checked) { + input.click(); + } + return Boolean(input?.checked); + })() + `, + ); +} + async function writeSummary(summary) { if (!outputJsonPath) { return; @@ -973,6 +1021,12 @@ async function collectDirectWebRtcStats(cdp) { ` (async () => { const totals = { + audioBytesReceived: 0, + audioConcealedSamples: 0, + audioJitter: 0, + audioPacketsLost: 0, + audioPacketsReceived: 0, + audioSilentConcealedSamples: 0, framesDecoded: 0, framesDropped: 0, jitter: 0, @@ -986,6 +1040,14 @@ async function collectDirectWebRtcStats(cdp) { for (const pc of window.__simdeckPeerConnections || []) { const reports = await pc.getStats(); for (const report of reports.values()) { + if (report.type === "inbound-rtp" && (report.kind === "audio" || report.mediaType === "audio")) { + totals.audioBytesReceived += report.bytesReceived || 0; + totals.audioConcealedSamples += report.concealedSamples || 0; + totals.audioJitter = Math.max(totals.audioJitter, report.jitter || 0); + totals.audioPacketsLost += report.packetsLost || 0; + totals.audioPacketsReceived += report.packetsReceived || 0; + totals.audioSilentConcealedSamples += report.silentConcealedSamples || 0; + } if (report.type === "inbound-rtp" && (report.kind === "video" || report.mediaType === "video")) { totals.framesDecoded += report.framesDecoded || 0; totals.framesDropped += report.framesDropped || 0; @@ -1007,6 +1069,66 @@ async function collectDirectWebRtcStats(cdp) { ); } +async function collectPeerConnectionSnapshot(cdp) { + return evaluate( + cdp, + ` + (() => { + const sectionSummaries = (sdp) => String(sdp || "") + .split(/\\r?\\nm=/) + .map((section, index) => { + const text = index === 0 ? section : "m=" + section; + const lines = text.split(/\\r?\\n/).filter(Boolean); + return lines.filter((line) => + line.startsWith("m=") || + line.startsWith("a=mid:") || + line === "a=sendonly" || + line === "a=recvonly" || + line === "a=sendrecv" || + line === "a=inactive" || + line.startsWith("a=rtpmap:") || + line.startsWith("a=fmtp:") || + line.startsWith("a=ssrc:") || + line.startsWith("a=msid:") + ); + }); + return (window.__simdeckPeerConnections || []).map((pc) => ({ + connectionState: pc.connectionState, + iceConnectionState: pc.iceConnectionState, + localDescription: { + sections: sectionSummaries(pc.localDescription?.sdp || ""), + type: pc.localDescription?.type || "", + }, + remoteDescription: { + sections: sectionSummaries(pc.remoteDescription?.sdp || ""), + type: pc.remoteDescription?.type || "", + }, + signalingState: pc.signalingState, + transceivers: pc.getTransceivers().map((transceiver) => ({ + currentDirection: transceiver.currentDirection || "", + direction: transceiver.direction, + mid: transceiver.mid, + receiverTrack: transceiver.receiver?.track ? { + enabled: transceiver.receiver.track.enabled, + id: transceiver.receiver.track.id, + kind: transceiver.receiver.track.kind, + muted: transceiver.receiver.track.muted, + readyState: transceiver.receiver.track.readyState, + } : null, + senderTrack: transceiver.sender?.track ? { + enabled: transceiver.sender.track.enabled, + id: transceiver.sender.track.id, + kind: transceiver.sender.track.kind, + muted: transceiver.sender.track.muted, + readyState: transceiver.sender.track.readyState, + } : null, + })), + })); + })() + `, + ); +} + async function waitForDecodedFrameAfterInteraction( cdp, baselineFramesDecoded,