Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion datafusion/spark/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ datafusion-functions-nested = { workspace = true }
log = { workspace = true }
percent-encoding = "2.3.2"
rand = { workspace = true }
serde_json = { workspace = true }
serde_json = { workspace = true, features = ["raw_value"] }
sha1 = "0.10"
sha2 = { workspace = true }
url = { workspace = true }
Expand Down
102 changes: 92 additions & 10 deletions datafusion/spark/src/function/json/json_tuple.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
// specific language governing permissions and limitations
// under the License.

use std::collections::HashMap;
use std::sync::Arc;

use arrow::array::{Array, ArrayRef, NullBufferBuilder, StringBuilder, StructArray};
Expand All @@ -25,6 +26,7 @@ use datafusion_expr::{
ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature,
Volatility,
};
use serde_json::value::RawValue;

/// Spark-compatible `json_tuple` expression
///
Expand Down Expand Up @@ -134,8 +136,9 @@ fn json_tuple_inner(args: &[ArrayRef], return_type: &DataType) -> Result<ArrayRe
}

let json_str = json_array.value(row_idx);
match serde_json::from_str::<serde_json::Value>(json_str) {
Ok(serde_json::Value::Object(map)) => {
// Parse into RawValue to preserve original text for numbers
match serde_json::from_str::<HashMap<String, Box<RawValue>>>(json_str) {
Ok(map) => {
null_buffer.append_non_null();
for (field_idx, builder) in builders.iter_mut().enumerate() {
if field_arrays[field_idx].is_null(row_idx) {
Expand All @@ -144,14 +147,32 @@ fn json_tuple_inner(args: &[ArrayRef], return_type: &DataType) -> Result<ArrayRe
}
let field_name = field_arrays[field_idx].value(row_idx);
match map.get(field_name) {
Some(serde_json::Value::Null) => {
builder.append_null();
}
Some(serde_json::Value::String(s)) => {
builder.append_value(s);
}
Some(other) => {
builder.append_value(other.to_string());
Some(raw) => {
let raw_str = raw.get();
if raw_str == "null" {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what if NULL?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With RawValue, values are not parsed — they stay as raw JSON text. So the JSON literal null arrives as the string "null" (not a Rust None). We compare against the raw text to detect it and emit a SQL NULL. This is the same behavior as the original serde_json::Value::Null branch, just expressed differently since we're working with raw text now.

builder.append_null();
} else if raw_str.starts_with('"') {
// String value: parse to unescape
match serde_json::from_str::<String>(raw_str) {
Ok(s) => builder.append_value(s),
Err(_) => builder.append_value(raw_str),
}
} else {
// Numbers, booleans, objects, arrays: raw text
// Spark uppercases exponent in numeric literals:
// 1.5e10 → 1.5E10
// Only apply to numbers (not booleans like "false")
Comment on lines +160 to +164
Copy link

Copilot AI Mar 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This implementation now returns the raw JSON token text not only for numbers, but also for booleans/objects/arrays (builder.append_value(raw_str)), whereas the previous serde_json::Value + to_string() path would have produced a canonicalized serialization (e.g., potentially different whitespace/key ordering). If the intended user-facing change is only number formatting/precision, consider keeping to_string() for non-number values or updating the PR description/docs to reflect the broader behavior change.

Copilot uses AI. Check for mistakes.
let first = raw_str.as_bytes().first();
Copy link

Copilot AI Mar 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

raw_str.as_bytes().first() returns Option<&u8>, but matches!(first, Some(b'0'..=b'9' | b'-')) is matching as if it were Option<u8>. This should fail to compile due to the &u8 vs u8 mismatch. Consider using .copied() (or otherwise dereferencing) before matching so the pattern works on Option<u8>.

Suggested change
let first = raw_str.as_bytes().first();
let first = raw_str.as_bytes().first().copied();

Copilot uses AI. Check for mistakes.
let is_number = matches!(first, Some(b'0'..=b'9' | b'-'));
if is_number && raw_str.contains('e') {
builder.append_value(raw_str.replace('e', "E"));
} else if is_number && raw_str == "-0" {
// Spark normalizes -0 to 0
builder.append_value("0");
} else {
builder.append_value(raw_str);
}
Comment on lines +160 to +174
Copy link

Copilot AI Mar 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

json_tuple currently returns the raw text for -0 (i.e. "-0"), but the test comment notes Spark returns "0". If the goal is Spark compatibility, consider normalizing negative zero to "0" (and then assert exactly "0" in the unit test) so behavior is deterministic and matches Spark.

Copilot uses AI. Check for mistakes.
}
}
None => {
builder.append_null();
Expand Down Expand Up @@ -191,6 +212,7 @@ fn json_tuple_inner(args: &[ArrayRef], return_type: &DataType) -> Result<ArrayRe
#[cfg(test)]
mod tests {
use super::*;
use arrow::array::StringArray;

#[test]
fn test_return_field_shape() {
Expand Down Expand Up @@ -219,6 +241,66 @@ mod tests {
}
}

/// Helper to run json_tuple with a single field and return the result string.
/// Runs `json_tuple_inner` over a single-row input with one requested field,
/// returning the extracted value, or `None` when the result is SQL NULL.
fn json_tuple_single(json: &str, field: &str) -> Option<String> {
    let json_col: ArrayRef = Arc::new(StringArray::from(vec![json]));
    let field_col: ArrayRef = Arc::new(StringArray::from(vec![field]));

    // json_tuple always yields a struct with one Utf8 column per field.
    let out_type = DataType::Struct(vec![Field::new("c0", DataType::Utf8, true)].into());

    let out = json_tuple_inner(&[json_col, field_col], &out_type).unwrap();
    let rows = out.as_any().downcast_ref::<StructArray>().unwrap();
    let values = rows
        .column(0)
        .as_any()
        .downcast_ref::<StringArray>()
        .unwrap();

    // Map the single row: NULL slot -> None, otherwise the owned string.
    (!values.is_null(0)).then(|| values.value(0).to_string())
}

#[test]
fn test_number_scientific_notation() {
    // Spark uppercases the exponent marker in numeric literals:
    // json_tuple('{"v":1.5e10}', 'v') → '1.5E10'
    let got = json_tuple_single(r#"{"v":1.5e10}"#, "v");
    assert_eq!(got.as_deref(), Some("1.5E10"));
}

#[test]
fn test_number_large_integer() {
    // Integers wider than i64/f64 must round-trip as raw text, never via
    // a lossy float conversion (Spark returns the literal unchanged).
    let got = json_tuple_single(r#"{"v":99999999999999999999}"#, "v");
    assert_eq!(got.as_deref(), Some("99999999999999999999"));
}

#[test]
fn test_number_negative_zero() {
    // Spark normalizes a JSON -0 literal to plain '0'.
    let got = json_tuple_single(r#"{"v":-0}"#, "v");
    assert_eq!(got.as_deref(), Some("0"));
}

#[test]
fn test_number_normal_int() {
    // Ordinary integers are returned verbatim.
    let got = json_tuple_single(r#"{"v":42}"#, "v");
    assert_eq!(got.as_deref(), Some("42"));
}

#[test]
fn test_number_normal_float() {
    // Ordinary decimal numbers are returned verbatim.
    let got = json_tuple_single(r#"{"v":3.14}"#, "v");
    assert_eq!(got.as_deref(), Some("3.14"));
}

#[test]
fn test_too_few_args() {
let func = JsonTuple::new();
Expand Down
82 changes: 82 additions & 0 deletions datafusion/sqllogictest/test_files/spark/json/json_tuple.slt
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,88 @@ SELECT json_tuple('{"имя":"Иван","город":"Москва"}'::STRING, '
----
{c0: Иван, c1: Москва}

# ── Additional edge cases ────────────────────────────────────

# Trailing comma in JSON is invalid → NULL
query ?
SELECT json_tuple('{"a":1,}'::STRING, 'a'::STRING);
----
NULL

# Empty string as key
query ?
SELECT json_tuple('{"":1}'::STRING, ''::STRING);
----
{c0: 1}

# "null" as key name (not JSON null, literal key)
query ?
SELECT json_tuple('{"null":1}'::STRING, 'null'::STRING);
----
{c0: 1}

# Interleaved existing and missing fields
query ?
SELECT json_tuple('{"a":1,"c":3}'::STRING, 'a'::STRING, 'b'::STRING, 'c'::STRING, 'd'::STRING);
----
{c0: 1, c1: NULL, c2: 3, c3: NULL}

# ── Number precision (raw text preservation) ─────────────────

# Scientific notation: Spark returns uppercase exponent
query ?
SELECT json_tuple('{"v":1.5e10}'::STRING, 'v'::STRING);
----
{c0: 1.5E10}

# Large integer: preserved without float conversion
query ?
SELECT json_tuple('{"v":99999999999999999999}'::STRING, 'v'::STRING);
----
{c0: 99999999999999999999}

# Normal integer
query ?
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe we also need tests for

-- Missing key
SELECT json_tuple('{"a":1}', 'b'); 
-- Tests behavior when requested key does not exist

-- Dot notation attempt
SELECT json_tuple('{"a":{"b":1}}', 'a.b'); 
-- Confirms dot notation is not supported

-- Deep nesting
SELECT json_tuple('{"a":{"b":{"c":{"d":1}}}}', 'a'); 
-- Tests extraction of nested object as string

-- Duplicate keys
SELECT json_tuple('{"a":1,"a":2}', 'a'); 
-- Tests duplicate key resolution (last wins)

-- Array field
SELECT json_tuple('{"a":[1,2,3]}', 'a'); 

I checked quickly in existing tests, seems those areas still uncovered

SELECT json_tuple('{"v":42}'::STRING, 'v'::STRING);
----
{c0: 42}

# Normal float
query ?
SELECT json_tuple('{"v":3.14}'::STRING, 'v'::STRING);
----
{c0: 3.14}

# Missing key returns NULL
query ?
SELECT json_tuple('{"a":1}'::STRING, 'b'::STRING);
----
{c0: NULL}

# Dot notation is not supported (treated as literal key)
query ?
SELECT json_tuple('{"a":{"b":1}}'::STRING, 'a.b'::STRING);
----
{c0: NULL}

# Deep nesting: nested object returned as raw string
query ?
SELECT json_tuple('{"a":{"b":{"c":{"d":1}}}}'::STRING, 'a'::STRING);
----
{c0: {"b":{"c":{"d":1}}}}

# Duplicate keys: last value wins
query ?
SELECT json_tuple('{"a":1,"a":2}'::STRING, 'a'::STRING);
----
{c0: 2}

# Array field: returned as raw string
query ?
SELECT json_tuple('{"a":[1,2,3]}'::STRING, 'a'::STRING);
----
{c0: [1,2,3]}

# Verify return type with arrow_typeof
query T
SELECT arrow_typeof(json_tuple('{"a":1}'::STRING, 'a'::STRING));
Expand Down