diff --git a/sentry_sdk/integrations/openai.py b/sentry_sdk/integrations/openai.py index 6707f8194b..480db9132d 100644 --- a/sentry_sdk/integrations/openai.py +++ b/sentry_sdk/integrations/openai.py @@ -50,8 +50,13 @@ from sentry_sdk.tracing import Span from sentry_sdk._types import TextPart - from openai.types.responses import ResponseInputParam, SequenceNotStr - from openai.types.responses import ResponseStreamEvent + from openai.types.responses.response_usage import ResponseUsage + from openai.types.responses import ( + ResponseInputParam, + SequenceNotStr, + ResponseStreamEvent, + ) + from openai.types import CompletionUsage from openai import Omit try: @@ -144,44 +149,56 @@ def _capture_exception(exc: "Any", manual_span_cleanup: bool = True) -> None: sentry_sdk.capture_event(event, hint=hint) -def _get_usage(usage: "Any", names: "List[str]") -> int: - for name in names: - if hasattr(usage, name) and isinstance(getattr(usage, name), int): - return getattr(usage, name) - return 0 +def _has_attr_and_is_int( + token_usage: "Union[CompletionUsage, ResponseUsage]", attr_name: str +) -> bool: + return hasattr(token_usage, attr_name) and isinstance( + getattr(token_usage, attr_name, None), int + ) -def _calculate_token_usage( +def _calculate_completions_token_usage( messages: "Optional[Iterable[ChatCompletionMessageParam]]", response: "Any", span: "Span", streaming_message_responses: "Optional[List[str]]", + streaming_message_total_token_usage: "Optional[CompletionUsage]", count_tokens: "Callable[..., Any]", ) -> None: + """Extract and record token usage from a Chat Completions API response.""" input_tokens: "Optional[int]" = 0 input_tokens_cached: "Optional[int]" = 0 output_tokens: "Optional[int]" = 0 output_tokens_reasoning: "Optional[int]" = 0 total_tokens: "Optional[int]" = 0 - - if hasattr(response, "usage"): - input_tokens = _get_usage(response.usage, ["input_tokens", "prompt_tokens"]) - if hasattr(response.usage, "input_tokens_details"): - input_tokens_cached = _get_usage( - response.usage.input_tokens_details, ["cached_tokens"] + usage = None + + if streaming_message_total_token_usage is not None: + usage = streaming_message_total_token_usage + elif hasattr(response, "usage"): + usage = response.usage + + if usage is not None: + if _has_attr_and_is_int(usage, "prompt_tokens"): + input_tokens = usage.prompt_tokens + if _has_attr_and_is_int(usage, "completion_tokens"): + output_tokens = usage.completion_tokens + if _has_attr_and_is_int(usage, "total_tokens"): + total_tokens = usage.total_tokens + + if hasattr(usage, "prompt_tokens_details"): + cached = getattr(usage.prompt_tokens_details, "cached_tokens", None) + if isinstance(cached, int): + input_tokens_cached = cached + + if hasattr(usage, "completion_tokens_details"): + reasoning = getattr( + usage.completion_tokens_details, "reasoning_tokens", None ) + if isinstance(reasoning, int): + output_tokens_reasoning = reasoning - output_tokens = _get_usage( - response.usage, ["output_tokens", "completion_tokens"] - ) - if hasattr(response.usage, "output_tokens_details"): - output_tokens_reasoning = _get_usage( - response.usage.output_tokens_details, ["reasoning_tokens"] - ) - - total_tokens = _get_usage(response.usage, ["total_tokens"]) - - # Manually count tokens + # Manually count input tokens if input_tokens == 0: for message in messages or []: if isinstance(message, str): @@ -191,11 +208,11 @@ def _calculate_token_usage( message_content = message.get("content") if message_content is None: continue - # Deliberate use of Completions function for both Completions and Responses input format. text_items = _get_text_items(message_content) input_tokens += sum(count_tokens(text) for text in text_items) continue + # Manually count output tokens if output_tokens == 0: if streaming_message_responses is not None: for message in streaming_message_responses: @@ -222,6 +239,84 @@ def _calculate_token_usage( ) +def _calculate_responses_token_usage( + input: "Any", + response: "Any", + span: "Span", + streaming_message_responses: "Optional[List[str]]", + count_tokens: "Callable[..., Any]", +) -> None: + """Extract and record token usage from a Responses API response.""" + input_tokens: "Optional[int]" = 0 + input_tokens_cached: "Optional[int]" = 0 + output_tokens: "Optional[int]" = 0 + output_tokens_reasoning: "Optional[int]" = 0 + total_tokens: "Optional[int]" = 0 + + if hasattr(response, "usage"): + usage = response.usage + + if _has_attr_and_is_int(usage, "input_tokens"): + input_tokens = usage.input_tokens + if _has_attr_and_is_int(usage, "output_tokens"): + output_tokens = usage.output_tokens + if _has_attr_and_is_int(usage, "total_tokens"): + total_tokens = usage.total_tokens + + if hasattr(usage, "input_tokens_details"): + cached = getattr(usage.input_tokens_details, "cached_tokens", None) + if isinstance(cached, int): + input_tokens_cached = cached + + if hasattr(usage, "output_tokens_details"): + reasoning = getattr(usage.output_tokens_details, "reasoning_tokens", None) + if isinstance(reasoning, int): + output_tokens_reasoning = reasoning + + # Manually count input tokens + if input_tokens == 0: + for message in input or []: + if isinstance(message, str): + input_tokens += count_tokens(message) + continue + elif isinstance(message, dict): + message_content = message.get("content") + if message_content is None: + continue + # Deliberate use of Completions function for both Completions and Responses input format. + text_items = _get_text_items(message_content) + input_tokens += sum(count_tokens(text) for text in text_items) + continue + + # Manually count output tokens + if output_tokens == 0: + if streaming_message_responses is not None: + for message in streaming_message_responses: + output_tokens += count_tokens(message) + elif hasattr(response, "output"): + for output_item in response.output: + if hasattr(output_item, "content"): + for content_item in output_item.content: + if hasattr(content_item, "text"): + output_tokens += count_tokens(content_item.text) + + # Do not set token data if it is 0 + input_tokens = input_tokens or None + input_tokens_cached = input_tokens_cached or None + output_tokens = output_tokens or None + output_tokens_reasoning = output_tokens_reasoning or None + total_tokens = total_tokens or None + + record_token_usage( + span, + input_tokens=input_tokens, + input_tokens_cached=input_tokens_cached, + output_tokens=output_tokens, + output_tokens_reasoning=output_tokens_reasoning, + total_tokens=total_tokens, + ) + + def _set_responses_api_input_data( span: "Span", kwargs: "dict[str, Any]", @@ -486,6 +581,7 @@ def _set_common_output_data( if hasattr(response, "model"): set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_MODEL, response.model) + # Chat Completions API if hasattr(response, "choices"): if should_send_default_pii() and integration.include_prompts: response_text = [ @@ -496,11 +592,19 @@ def _set_common_output_data( if len(response_text) > 0: set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_TEXT, response_text) - _calculate_token_usage(input, response, span, None, integration.count_tokens) + _calculate_completions_token_usage( + messages=input, + response=response, + span=span, + streaming_message_responses=None, + streaming_message_total_token_usage=None, + count_tokens=integration.count_tokens, + ) if finish_span: span.__exit__(None, None, None) + # Responses API elif hasattr(response, "output"): if should_send_default_pii() and integration.include_prompts: output_messages: "dict[str, list[Any]]" = { @@ -532,12 +636,26 @@ def _set_common_output_data( span, SPANDATA.GEN_AI_RESPONSE_TEXT, output_messages["response"] ) - _calculate_token_usage(input, response, span, None, integration.count_tokens) + _calculate_responses_token_usage( + input=input, + response=response, + span=span, + streaming_message_responses=None, + count_tokens=integration.count_tokens, + ) if finish_span: span.__exit__(None, None, None) + # Embeddings API (fallback for responses with neither choices nor output) else: - _calculate_token_usage(input, response, span, None, integration.count_tokens) + _calculate_completions_token_usage( + messages=input, + response=response, + span=span, + streaming_message_responses=None, + streaming_message_total_token_usage=None, + count_tokens=integration.count_tokens, + ) if finish_span: span.__exit__(None, None, None) @@ -655,6 +773,7 @@ def _wrap_synchronous_completions_chunk_iterator( """ ttft = None data_buf: "list[list[str]]" = [] # one for each choice + streaming_message_total_token_usage = None for x in old_iterator: span.set_data(SPANDATA.GEN_AI_RESPONSE_MODEL, x.model) @@ -671,6 +790,8 @@ def _wrap_synchronous_completions_chunk_iterator( data_buf.append([]) data_buf[choice_index].append(content or "") choice_index += 1 + if hasattr(x, "usage"): + streaming_message_total_token_usage = x.usage yield x @@ -679,17 +800,20 @@ def _wrap_synchronous_completions_chunk_iterator( set_data_normalized( span, SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft ) + all_responses = None if len(data_buf) > 0: all_responses = ["".join(chunk) for chunk in data_buf] if should_send_default_pii() and integration.include_prompts: set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_TEXT, all_responses) - _calculate_token_usage( - messages, - response, - span, - all_responses, - integration.count_tokens, - ) + + _calculate_completions_token_usage( + messages=messages, + response=response, + span=span, + streaming_message_responses=all_responses, + streaming_message_total_token_usage=streaming_message_total_token_usage, + count_tokens=integration.count_tokens, + ) if finish_span: span.__exit__(None, None, None) @@ -711,6 +835,7 @@ async def _wrap_asynchronous_completions_chunk_iterator( """ ttft = None data_buf: "list[list[str]]" = [] # one for each choice + streaming_message_total_token_usage = None async for x in old_iterator: span.set_data(SPANDATA.GEN_AI_RESPONSE_MODEL, x.model) @@ -727,6 +852,8 @@ async def _wrap_asynchronous_completions_chunk_iterator( data_buf.append([]) data_buf[choice_index].append(content or "") choice_index += 1 + if hasattr(x, "usage"): + streaming_message_total_token_usage = x.usage yield x @@ -735,17 +862,20 @@ async def _wrap_asynchronous_completions_chunk_iterator( set_data_normalized( span, SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft ) + all_responses = None if len(data_buf) > 0: all_responses = ["".join(chunk) for chunk in data_buf] if should_send_default_pii() and integration.include_prompts: set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_TEXT, all_responses) - _calculate_token_usage( - messages, - response, - span, - all_responses, - integration.count_tokens, - ) + + _calculate_completions_token_usage( + messages=messages, + response=response, + span=span, + streaming_message_responses=all_responses, + streaming_message_total_token_usage=streaming_message_total_token_usage, + count_tokens=integration.count_tokens, + ) if finish_span: span.__exit__(None, None, None) @@ -781,12 +911,12 @@ def _wrap_synchronous_responses_event_iterator( if isinstance(x, ResponseCompletedEvent): span.set_data(SPANDATA.GEN_AI_RESPONSE_MODEL, x.response.model) - _calculate_token_usage( - input, - x.response, - span, - None, - integration.count_tokens, + _calculate_responses_token_usage( + input=input, + response=x.response, + span=span, + streaming_message_responses=None, + count_tokens=integration.count_tokens, ) count_tokens_manually = False @@ -801,13 +931,14 @@ def _wrap_synchronous_responses_event_iterator( all_responses = ["".join(chunk) for chunk in data_buf] if should_send_default_pii() and integration.include_prompts: set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_TEXT, all_responses) + if count_tokens_manually: - _calculate_token_usage( - input, - response, - span, - all_responses, - integration.count_tokens, + _calculate_responses_token_usage( + input=input, + response=response, + span=span, + streaming_message_responses=all_responses, + count_tokens=integration.count_tokens, ) if finish_span: @@ -844,12 +975,12 @@ async def _wrap_asynchronous_responses_event_iterator( if isinstance(x, ResponseCompletedEvent): span.set_data(SPANDATA.GEN_AI_RESPONSE_MODEL, x.response.model) - _calculate_token_usage( - input, - x.response, - span, - None, - integration.count_tokens, + _calculate_responses_token_usage( + input=input, + response=x.response, + span=span, + streaming_message_responses=None, + count_tokens=integration.count_tokens, ) count_tokens_manually = False @@ -865,12 +996,12 @@ async def _wrap_asynchronous_responses_event_iterator( if should_send_default_pii() and integration.include_prompts: set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_TEXT, all_responses) if count_tokens_manually: - _calculate_token_usage( - input, - response, - span, - all_responses, - integration.count_tokens, + _calculate_responses_token_usage( + input=input, + response=response, + span=span, + streaming_message_responses=all_responses, + count_tokens=integration.count_tokens, ) if finish_span: span.__exit__(None, None, None) diff --git a/tests/integrations/openai/test_openai.py b/tests/integrations/openai/test_openai.py index 0fd049e742..ada2e633de 100644 --- a/tests/integrations/openai/test_openai.py +++ b/tests/integrations/openai/test_openai.py @@ -44,7 +44,8 @@ from sentry_sdk.consts import SPANDATA, OP from sentry_sdk.integrations.openai import ( OpenAIIntegration, - _calculate_token_usage, + _calculate_completions_token_usage, + _calculate_responses_token_usage, ) from sentry_sdk.utils import safe_serialize @@ -610,6 +611,304 @@ def test_streaming_chat_completion_no_prompts( pass # if tiktoken is not installed, we can't guarantee token usage will be calculated properly +@pytest.mark.skipif( + OPENAI_VERSION <= (1, 1, 0), + reason="OpenAI versions <=1.1.0 do not support the stream_options parameter.", +) +def test_streaming_chat_completion_with_usage_in_stream( + sentry_init, + capture_events, + get_model_response, + server_side_event_chunks, +): + """When stream_options=include_usage is set, token usage comes from the final chunk's usage field.""" + sentry_init( + integrations=[OpenAIIntegration(include_prompts=False)], + traces_sample_rate=1.0, + send_default_pii=False, + ) + events = capture_events() + + client = OpenAI(api_key="z") + returned_stream = get_model_response( + server_side_event_chunks( + [ + ChatCompletionChunk( + id="1", + choices=[ + DeltaChoice( + index=0, + delta=ChoiceDelta(content="hel"), + finish_reason=None, + ) + ], + created=100000, + model="model-id", + object="chat.completion.chunk", + ), + ChatCompletionChunk( + id="1", + choices=[ + DeltaChoice( + index=0, + delta=ChoiceDelta(content="lo"), + finish_reason="stop", + ) + ], + created=100000, + model="model-id", + object="chat.completion.chunk", + usage=CompletionUsage( + prompt_tokens=20, + completion_tokens=10, + total_tokens=30, + ), + ), + ], + include_event_type=False, + ) + ) + + with mock.patch.object( + client.chat._client._client, + "send", + return_value=returned_stream, + ): + with start_transaction(name="openai tx"): + response_stream = client.chat.completions.create( + model="some-model", + messages=[{"role": "user", "content": "hello"}], + stream=True, + stream_options={"include_usage": True}, + ) + for _ in response_stream: + pass + + tx = events[0] + assert tx["type"] == "transaction" + span = tx["spans"][0] + assert span["op"] == "gen_ai.chat" + assert span["data"]["gen_ai.usage.input_tokens"] == 20 + assert span["data"]["gen_ai.usage.output_tokens"] == 10 + assert span["data"]["gen_ai.usage.total_tokens"] == 30 + + +@pytest.mark.skipif( + OPENAI_VERSION <= (1, 1, 0), + reason="OpenAI versions <=1.1.0 do not support the stream_options parameter.", +) +def test_streaming_chat_completion_empty_content_preserves_token_usage( + sentry_init, + capture_events, + get_model_response, + server_side_event_chunks, +): + """Token usage from the stream is recorded even when no content is produced (e.g. content filter).""" + sentry_init( + integrations=[OpenAIIntegration(include_prompts=False)], + traces_sample_rate=1.0, + send_default_pii=False, + ) + events = capture_events() + + client = OpenAI(api_key="z") + returned_stream = get_model_response( + server_side_event_chunks( + [ + ChatCompletionChunk( + id="1", + choices=[], + created=100000, + model="model-id", + object="chat.completion.chunk", + usage=CompletionUsage( + prompt_tokens=20, + completion_tokens=0, + total_tokens=20, + ), + ), + ], + include_event_type=False, + ) + ) + + with mock.patch.object( + client.chat._client._client, + "send", + return_value=returned_stream, + ): + with start_transaction(name="openai tx"): + response_stream = client.chat.completions.create( + model="some-model", + messages=[{"role": "user", "content": "hello"}], + stream=True, + stream_options={"include_usage": True}, + ) + for _ in response_stream: + pass + + tx = events[0] + assert tx["type"] == "transaction" + span = tx["spans"][0] + assert span["op"] == "gen_ai.chat" + assert span["data"]["gen_ai.usage.input_tokens"] == 20 + assert "gen_ai.usage.output_tokens" not in span["data"] + assert span["data"]["gen_ai.usage.total_tokens"] == 20 + + +@pytest.mark.skipif( + OPENAI_VERSION <= (1, 1, 0), + reason="OpenAI versions <=1.1.0 do not support the stream_options parameter.", +) +@pytest.mark.asyncio +async def test_streaming_chat_completion_empty_content_preserves_token_usage_async( + sentry_init, + capture_events, + get_model_response, + async_iterator, + server_side_event_chunks, +): + """Token usage from the stream is recorded even when no content is produced - async variant.""" + sentry_init( + integrations=[OpenAIIntegration(include_prompts=False)], + traces_sample_rate=1.0, + send_default_pii=False, + ) + events = capture_events() + + client = AsyncOpenAI(api_key="z") + returned_stream = get_model_response( + async_iterator( + server_side_event_chunks( + [ + ChatCompletionChunk( + id="1", + choices=[], + created=100000, + model="model-id", + object="chat.completion.chunk", + usage=CompletionUsage( + prompt_tokens=20, + completion_tokens=0, + total_tokens=20, + ), + ), + ], + include_event_type=False, + ) + ) + ) + + with mock.patch.object( + client.chat._client._client, + "send", + return_value=returned_stream, + ): + with start_transaction(name="openai tx"): + response_stream = await client.chat.completions.create( + model="some-model", + messages=[{"role": "user", "content": "hello"}], + stream=True, + stream_options={"include_usage": True}, + ) + async for _ in response_stream: + pass + + tx = events[0] + assert tx["type"] == "transaction" + span = tx["spans"][0] + assert span["op"] == "gen_ai.chat" + assert span["data"]["gen_ai.usage.input_tokens"] == 20 + assert "gen_ai.usage.output_tokens" not in span["data"] + assert span["data"]["gen_ai.usage.total_tokens"] == 20 + + +@pytest.mark.skipif( + OPENAI_VERSION <= (1, 1, 0), + reason="OpenAI versions <=1.1.0 do not support the stream_options parameter.", +) +@pytest.mark.asyncio +async def test_streaming_chat_completion_async_with_usage_in_stream( + sentry_init, + capture_events, + get_model_response, + async_iterator, + server_side_event_chunks, +): + """When stream_options=include_usage is set, token usage comes from the final chunk's usage field (async).""" + sentry_init( + integrations=[OpenAIIntegration(include_prompts=False)], + traces_sample_rate=1.0, + send_default_pii=False, + ) + events = capture_events() + + client = AsyncOpenAI(api_key="z") + returned_stream = get_model_response( + async_iterator( + server_side_event_chunks( + [ + ChatCompletionChunk( + id="1", + choices=[ + DeltaChoice( + index=0, + delta=ChoiceDelta(content="hel"), + finish_reason=None, + ) + ], + created=100000, + model="model-id", + object="chat.completion.chunk", + ), + ChatCompletionChunk( + id="1", + choices=[ + DeltaChoice( + index=0, + delta=ChoiceDelta(content="lo"), + finish_reason="stop", + ) + ], + created=100000, + model="model-id", + object="chat.completion.chunk", + usage=CompletionUsage( + prompt_tokens=20, + completion_tokens=10, + total_tokens=30, + ), + ), + ], + include_event_type=False, + ) + ) + ) + + with mock.patch.object( + client.chat._client._client, + "send", + return_value=returned_stream, + ): + with start_transaction(name="openai tx"): + response_stream = await client.chat.completions.create( + model="some-model", + messages=[{"role": "user", "content": "hello"}], + stream=True, + stream_options={"include_usage": True}, + ) + async for _ in response_stream: + pass + + tx = events[0] + assert tx["type"] == "transaction" + span = tx["spans"][0] + assert span["op"] == "gen_ai.chat" + assert span["data"]["gen_ai.usage.input_tokens"] == 20 + assert span["data"]["gen_ai.usage.output_tokens"] == 10 + assert span["data"]["gen_ai.usage.total_tokens"] == 30 + + # noinspection PyTypeChecker @pytest.mark.parametrize( "messages", @@ -1780,7 +2079,8 @@ async def test_span_origin_embeddings_async(sentry_init, capture_events): assert event["spans"][0]["origin"] == "auto.ai.openai" -def test_calculate_token_usage_a(): +def test_completions_token_usage_from_response(): + """Token counts are extracted from response.usage using Completions API field names.""" span = mock.MagicMock() def count_tokens(msg): @@ -1797,8 +2097,13 @@ def count_tokens(msg): with mock.patch( "sentry_sdk.integrations.openai.record_token_usage" ) as mock_record_token_usage: - _calculate_token_usage( - messages, response, span, streaming_message_responses, count_tokens + _calculate_completions_token_usage( + messages=messages, + response=response, + span=span, + streaming_message_responses=streaming_message_responses, + streaming_message_total_token_usage=None, + count_tokens=count_tokens, ) mock_record_token_usage.assert_called_once_with( span, @@ -1810,7 +2115,46 @@ def count_tokens(msg): ) -def test_calculate_token_usage_b(): +def test_completions_token_usage_with_detailed_fields(): + """Cached and reasoning token counts are extracted from prompt_tokens_details and completion_tokens_details.""" + span = mock.MagicMock() + + def count_tokens(msg): + return len(str(msg)) + + response = mock.MagicMock() + response.usage = mock.MagicMock() + response.usage.prompt_tokens = 20 + response.usage.prompt_tokens_details = mock.MagicMock() + response.usage.prompt_tokens_details.cached_tokens = 5 + response.usage.completion_tokens = 10 + response.usage.completion_tokens_details = mock.MagicMock() + response.usage.completion_tokens_details.reasoning_tokens = 8 + response.usage.total_tokens = 30 + + with mock.patch( + "sentry_sdk.integrations.openai.record_token_usage" + ) as mock_record_token_usage: + _calculate_completions_token_usage( + messages=[], + response=response, + span=span, + streaming_message_responses=[], + streaming_message_total_token_usage=None, + count_tokens=count_tokens, + ) + mock_record_token_usage.assert_called_once_with( + span, + input_tokens=20, + input_tokens_cached=5, + output_tokens=10, + output_tokens_reasoning=8, + total_tokens=30, + ) + + +def test_completions_token_usage_manual_input_counting(): + """When prompt_tokens is missing, input tokens are counted manually from messages.""" span = mock.MagicMock() def count_tokens(msg): @@ -1830,8 +2174,13 @@ def count_tokens(msg): with mock.patch( "sentry_sdk.integrations.openai.record_token_usage" ) as mock_record_token_usage: - _calculate_token_usage( - messages, response, span, streaming_message_responses, count_tokens + _calculate_completions_token_usage( + messages=messages, + response=response, + span=span, + streaming_message_responses=streaming_message_responses, + streaming_message_total_token_usage=None, + count_tokens=count_tokens, ) mock_record_token_usage.assert_called_once_with( span, @@ -1843,7 +2192,8 @@ def count_tokens(msg): ) -def test_calculate_token_usage_c(): +def test_completions_token_usage_manual_output_counting_streaming(): + """When completion_tokens is missing, output tokens are counted from streaming responses.""" span = mock.MagicMock() def count_tokens(msg): @@ -1863,8 +2213,13 @@ def count_tokens(msg): with mock.patch( "sentry_sdk.integrations.openai.record_token_usage" ) as mock_record_token_usage: - _calculate_token_usage( - messages, response, span, streaming_message_responses, count_tokens + _calculate_completions_token_usage( + messages=messages, + response=response, + span=span, + streaming_message_responses=streaming_message_responses, + streaming_message_total_token_usage=None, + count_tokens=count_tokens, ) mock_record_token_usage.assert_called_once_with( span, @@ -1876,7 +2231,8 @@ def count_tokens(msg): ) -def test_calculate_token_usage_d(): +def test_completions_token_usage_manual_output_counting_choices(): + """When completion_tokens is missing, output tokens are counted from response.choices.""" span = mock.MagicMock() def count_tokens(msg): @@ -1887,30 +2243,48 @@ def count_tokens(msg): response.usage.prompt_tokens = 20 response.usage.total_tokens = 20 response.choices = [ - mock.MagicMock(message="one"), - mock.MagicMock(message="two"), - mock.MagicMock(message="three"), + Choice( + index=0, + finish_reason="stop", + message=ChatCompletionMessage(role="assistant", content="one"), + ), + Choice( + index=1, + finish_reason="stop", + message=ChatCompletionMessage(role="assistant", content="two"), + ), + Choice( + index=2, + finish_reason="stop", + message=ChatCompletionMessage(role="assistant", content="three"), + ), ] messages = [] - streaming_message_responses = [] + streaming_message_responses = None with mock.patch( "sentry_sdk.integrations.openai.record_token_usage" ) as mock_record_token_usage: - _calculate_token_usage( - messages, response, span, streaming_message_responses, count_tokens + _calculate_completions_token_usage( + messages=messages, + response=response, + span=span, + streaming_message_responses=streaming_message_responses, + streaming_message_total_token_usage=None, + count_tokens=count_tokens, ) mock_record_token_usage.assert_called_once_with( span, input_tokens=20, input_tokens_cached=None, - output_tokens=None, + output_tokens=11, output_tokens_reasoning=None, total_tokens=20, ) -def test_calculate_token_usage_e(): +def test_completions_token_usage_no_usage_data(): + """When response has no usage data and no streaming responses, all tokens are None.""" span = mock.MagicMock() def count_tokens(msg): @@ -1923,8 +2297,75 @@ def count_tokens(msg): with mock.patch( "sentry_sdk.integrations.openai.record_token_usage" ) as mock_record_token_usage: - _calculate_token_usage( - messages, response, span, streaming_message_responses, count_tokens + _calculate_completions_token_usage( + messages=messages, + response=response, + span=span, + streaming_message_responses=streaming_message_responses, + streaming_message_total_token_usage=None, + count_tokens=count_tokens, + ) + mock_record_token_usage.assert_called_once_with( + span, + input_tokens=None, + input_tokens_cached=None, + output_tokens=None, + output_tokens_reasoning=None, + total_tokens=None, + ) + + +@pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available") +def test_responses_token_usage_from_response(): + """Token counts including cached and reasoning tokens are extracted from Responses API.""" + span = mock.MagicMock() + + def count_tokens(msg): + return len(str(msg)) + + response = mock.MagicMock() + response.usage = mock.MagicMock() + response.usage.input_tokens = 20 + response.usage.input_tokens_details = mock.MagicMock() + response.usage.input_tokens_details.cached_tokens = 5 + response.usage.output_tokens = 10 + response.usage.output_tokens_details = mock.MagicMock() + response.usage.output_tokens_details.reasoning_tokens = 8 + response.usage.total_tokens = 30 + input = [] + + with mock.patch( + "sentry_sdk.integrations.openai.record_token_usage" + ) as mock_record_token_usage: + _calculate_responses_token_usage(input, response, span, None, count_tokens) + mock_record_token_usage.assert_called_once_with( + span, + input_tokens=20, + input_tokens_cached=5, + output_tokens=10, + output_tokens_reasoning=8, + total_tokens=30, + ) + + +@pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available") +def test_responses_token_usage_no_usage_data(): + """When Responses API response has no usage data, all tokens are None.""" + span = mock.MagicMock() + + def count_tokens(msg): + return len(str(msg)) + + response = mock.MagicMock() + response.usage = None + input = [] + streaming_message_responses = None + + with mock.patch( + "sentry_sdk.integrations.openai.record_token_usage" + ) as mock_record_token_usage: + _calculate_responses_token_usage( + input, response, span, streaming_message_responses, count_tokens ) mock_record_token_usage.assert_called_once_with( span, @@ -1936,6 +2377,70 @@ def count_tokens(msg): ) +@pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available") +def test_responses_token_usage_manual_output_counting_response_output(): + """When output_tokens is missing, output tokens are counted from response.output.""" + span = mock.MagicMock() + + def count_tokens(msg): + return len(str(msg)) + + response = mock.MagicMock() + response.usage = mock.MagicMock() + response.usage.input_tokens = 20 + response.usage.total_tokens = 20 + response.output = [ + ResponseOutputMessage( + id="msg-1", + content=[ + ResponseOutputText( + annotations=[], + text="one", + type="output_text", + ), + ], + role="assistant", + status="completed", + type="message", + ), + ResponseOutputMessage( + id="msg-2", + content=[ + ResponseOutputText( + annotations=[], + text="two", + type="output_text", + ), + ResponseOutputText( + annotations=[], + text="three", + type="output_text", + ), + ], + role="assistant", + status="completed", + type="message", + ), + ] + input = [] + streaming_message_responses = None + + with mock.patch( + "sentry_sdk.integrations.openai.record_token_usage" + ) as mock_record_token_usage: + _calculate_responses_token_usage( + input, response, span, streaming_message_responses, count_tokens + ) + mock_record_token_usage.assert_called_once_with( + span, + input_tokens=20, + input_tokens_cached=None, + output_tokens=11, + output_tokens_reasoning=None, + total_tokens=20, + ) + + @pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available") def test_ai_client_span_responses_api_no_pii(sentry_init, capture_events): sentry_init(