From 59a3ff3c1a34c659d0fe92e6c67182bef6ca7cd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20L=C3=B3pez?= Date: Mon, 30 Mar 2026 22:53:48 +0200 Subject: [PATCH 1/4] fix(spark): preserve raw number text in json_tuple to match Spark behavior --- datafusion/spark/Cargo.toml | 2 +- .../spark/src/function/json/json_tuple.rs | 102 ++++++++++++++++-- .../test_files/spark/json/json_tuple.slt | 52 +++++++++ 3 files changed, 145 insertions(+), 11 deletions(-) diff --git a/datafusion/spark/Cargo.toml b/datafusion/spark/Cargo.toml index 162b6d814e804..a74196967e2cb 100644 --- a/datafusion/spark/Cargo.toml +++ b/datafusion/spark/Cargo.toml @@ -59,7 +59,7 @@ datafusion-functions-nested = { workspace = true } log = { workspace = true } percent-encoding = "2.3.2" rand = { workspace = true } -serde_json = { workspace = true } +serde_json = { workspace = true, features = ["raw_value"] } sha1 = "0.10" sha2 = { workspace = true } url = { workspace = true } diff --git a/datafusion/spark/src/function/json/json_tuple.rs b/datafusion/spark/src/function/json/json_tuple.rs index 3496f979ffe06..044dcd2854fc2 100644 --- a/datafusion/spark/src/function/json/json_tuple.rs +++ b/datafusion/spark/src/function/json/json_tuple.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use std::collections::HashMap; use std::sync::Arc; use arrow::array::{Array, ArrayRef, NullBufferBuilder, StringBuilder, StructArray}; @@ -25,6 +26,7 @@ use datafusion_expr::{ ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, }; +use serde_json::value::RawValue; /// Spark-compatible `json_tuple` expression /// @@ -134,8 +136,9 @@ fn json_tuple_inner(args: &[ArrayRef], return_type: &DataType) -> Result(json_str) { - Ok(serde_json::Value::Object(map)) => { + // Parse into RawValue to preserve original text for numbers + match serde_json::from_str::>>(json_str) { + Ok(map) => { null_buffer.append_non_null(); for (field_idx, builder) in builders.iter_mut().enumerate() { if field_arrays[field_idx].is_null(row_idx) { @@ -144,14 +147,25 @@ fn json_tuple_inner(args: &[ArrayRef], return_type: &DataType) -> Result { - builder.append_null(); - } - Some(serde_json::Value::String(s)) => { - builder.append_value(s); - } - Some(other) => { - builder.append_value(other.to_string()); + Some(raw) => { + let raw_str = raw.get(); + if raw_str == "null" { + builder.append_null(); + } else if raw_str.starts_with('"') { + // String value: parse to unescape + match serde_json::from_str::(raw_str) { + Ok(s) => builder.append_value(s), + Err(_) => builder.append_value(raw_str), + } + } else { + // Numbers, booleans: use raw text as-is + // Spark uppercases exponent: 1.5e10 → 1.5E10 + if raw_str.contains('e') { + builder.append_value(raw_str.replace('e', "E")); + } else { + builder.append_value(raw_str); + } + } } None => { builder.append_null(); @@ -191,6 +205,7 @@ fn json_tuple_inner(args: &[ArrayRef], return_type: &DataType) -> Result Option { + let json_arr: ArrayRef = Arc::new(StringArray::from(vec![json])); + let field_arr: ArrayRef = Arc::new(StringArray::from(vec![field])); + + let return_type = + DataType::Struct(vec![Field::new("c0", DataType::Utf8, true)].into()); + + let result = json_tuple_inner(&[json_arr, field_arr], &return_type).unwrap(); + let struct_arr = result.as_any().downcast_ref::().unwrap(); + let col = struct_arr.column(0); + let str_arr = col.as_any().downcast_ref::().unwrap(); + + if str_arr.is_null(0) { + None + } else { + Some(str_arr.value(0).to_string()) + } + } + + #[test] + fn test_number_scientific_notation() { + // Spark: json_tuple('{"v":1.5e10}', 'v') → '1.5E10' + assert_eq!( + json_tuple_single(r#"{"v":1.5e10}"#, "v"), + Some("1.5E10".to_string()) + ); + } + + #[test] + fn test_number_large_integer() { + // Spark: json_tuple('{"v":99999999999999999999}', 'v') → '99999999999999999999' + assert_eq!( + json_tuple_single(r#"{"v":99999999999999999999}"#, "v"), + Some("99999999999999999999".to_string()) + ); + } + + #[test] + fn test_number_negative_zero() { + // Spark: json_tuple('{"v":-0}', 'v') → '0' + // RawValue preserves '-0', but Spark returns '0' + // This is acceptable — both are valid representations + let result = json_tuple_single(r#"{"v":-0}"#, "v"); + assert!( + result == Some("-0".to_string()) || result == Some("0".to_string()), + "expected '-0' or '0', got {:?}", + result + ); + } + + #[test] + fn test_number_normal_int() { + assert_eq!( + json_tuple_single(r#"{"v":42}"#, "v"), + Some("42".to_string()) + ); + } + + #[test] + fn test_number_normal_float() { + assert_eq!( + json_tuple_single(r#"{"v":3.14}"#, "v"), + Some("3.14".to_string()) + ); + } + #[test] fn test_too_few_args() { let func = JsonTuple::new(); diff --git a/datafusion/sqllogictest/test_files/spark/json/json_tuple.slt b/datafusion/sqllogictest/test_files/spark/json/json_tuple.slt index c0c424946709f..146c4c174c613 100644 --- a/datafusion/sqllogictest/test_files/spark/json/json_tuple.slt +++ b/datafusion/sqllogictest/test_files/spark/json/json_tuple.slt @@ -147,6 +147,58 @@ SELECT json_tuple('{"имя":"Иван","город":"Москва"}'::STRING, ' ---- {c0: Иван, c1: Москва} +# ── Additional edge cases ──────────────────────────────────── + +# Trailing comma in JSON is invalid → NULL +query ? +SELECT json_tuple('{"a":1,}'::STRING, 'a'::STRING); +---- +NULL + +# Empty string as key +query ? +SELECT json_tuple('{"":1}'::STRING, ''::STRING); +---- +{c0: 1} + +# "null" as key name (not JSON null, literal key) +query ? +SELECT json_tuple('{"null":1}'::STRING, 'null'::STRING); +---- +{c0: 1} + +# Interleaved existing and missing fields +query ? +SELECT json_tuple('{"a":1,"c":3}'::STRING, 'a'::STRING, 'b'::STRING, 'c'::STRING, 'd'::STRING); +---- +{c0: 1, c1: NULL, c2: 3, c3: NULL} + +# ── Number precision (raw text preservation) ───────────────── + +# Scientific notation: Spark returns uppercase exponent +query ? +SELECT json_tuple('{"v":1.5e10}'::STRING, 'v'::STRING); +---- +{c0: 1.5E10} + +# Large integer: preserved without float conversion +query ? +SELECT json_tuple('{"v":99999999999999999999}'::STRING, 'v'::STRING); +---- +{c0: 99999999999999999999} + +# Normal integer +query ? +SELECT json_tuple('{"v":42}'::STRING, 'v'::STRING); +---- +{c0: 42} + +# Normal float +query ? +SELECT json_tuple('{"v":3.14}'::STRING, 'v'::STRING); +---- +{c0: 3.14} + # Verify return type with arrow_typeof query T SELECT arrow_typeof(json_tuple('{"a":1}'::STRING, 'a'::STRING)); From 618de1527753a6b38a66b6174dbc456b6adc1906 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20L=C3=B3pez?= Date: Mon, 30 Mar 2026 23:08:12 +0200 Subject: [PATCH 2/4] fix fmt --- datafusion/spark/src/function/json/json_tuple.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/datafusion/spark/src/function/json/json_tuple.rs b/datafusion/spark/src/function/json/json_tuple.rs index 044dcd2854fc2..85a0a53be530d 100644 --- a/datafusion/spark/src/function/json/json_tuple.rs +++ b/datafusion/spark/src/function/json/json_tuple.rs @@ -280,8 +280,7 @@ mod tests { let result = json_tuple_single(r#"{"v":-0}"#, "v"); assert!( result == Some("-0".to_string()) || result == Some("0".to_string()), - "expected '-0' or '0', got {:?}", - result + "expected '-0' or '0', got {result:?}" ); } From 0e457168da505d7443d682b3a7036e2a4f8905e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20L=C3=B3pez?= Date: Mon, 30 Mar 2026 23:40:28 +0200 Subject: [PATCH 3/4] fix copilot --- .../spark/src/function/json/json_tuple.rs | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/datafusion/spark/src/function/json/json_tuple.rs b/datafusion/spark/src/function/json/json_tuple.rs index 85a0a53be530d..7ff831d1515d2 100644 --- a/datafusion/spark/src/function/json/json_tuple.rs +++ b/datafusion/spark/src/function/json/json_tuple.rs @@ -158,10 +158,17 @@ fn json_tuple_inner(args: &[ArrayRef], return_type: &DataType) -> Result builder.append_value(raw_str), } } else { - // Numbers, booleans: use raw text as-is - // Spark uppercases exponent: 1.5e10 → 1.5E10 - if raw_str.contains('e') { + // Numbers, booleans, objects, arrays: raw text + // Spark uppercases exponent in numeric literals: + // 1.5e10 → 1.5E10 + // Only apply to numbers (not booleans like "false") + let first = raw_str.as_bytes().first(); + let is_number = matches!(first, Some(b'0'..=b'9' | b'-')); + if is_number && raw_str.contains('e') { builder.append_value(raw_str.replace('e', "E")); + } else if is_number && raw_str == "-0" { + // Spark normalizes -0 to 0 + builder.append_value("0"); } else { builder.append_value(raw_str); } @@ -275,13 +282,7 @@ mod tests { #[test] fn test_number_negative_zero() { // Spark: json_tuple('{"v":-0}', 'v') → '0' - // RawValue preserves '-0', but Spark returns '0' - // This is acceptable — both are valid representations - let result = json_tuple_single(r#"{"v":-0}"#, "v"); - assert!( - result == Some("-0".to_string()) || result == Some("0".to_string()), - "expected '-0' or '0', got {result:?}" - ); + assert_eq!(json_tuple_single(r#"{"v":-0}"#, "v"), Some("0".to_string())); } #[test] From 5c9a63d8cb5af7904c8b6a01a889360267f6bdc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20L=C3=B3pez?= Date: Fri, 3 Apr 2026 10:33:02 +0200 Subject: [PATCH 4/4] test --- .../test_files/spark/json/json_tuple.slt | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/datafusion/sqllogictest/test_files/spark/json/json_tuple.slt b/datafusion/sqllogictest/test_files/spark/json/json_tuple.slt index 146c4c174c613..a454f14d5fb65 100644 --- a/datafusion/sqllogictest/test_files/spark/json/json_tuple.slt +++ b/datafusion/sqllogictest/test_files/spark/json/json_tuple.slt @@ -199,6 +199,36 @@ SELECT json_tuple('{"v":3.14}'::STRING, 'v'::STRING); ---- {c0: 3.14} +# Missing key returns NULL +query ? +SELECT json_tuple('{"a":1}'::STRING, 'b'::STRING); +---- +{c0: NULL} + +# Dot notation is not supported (treated as literal key) +query ? +SELECT json_tuple('{"a":{"b":1}}'::STRING, 'a.b'::STRING); +---- +{c0: NULL} + +# Deep nesting: nested object returned as raw string +query ? +SELECT json_tuple('{"a":{"b":{"c":{"d":1}}}}'::STRING, 'a'::STRING); +---- +{c0: {"b":{"c":{"d":1}}}} + +# Duplicate keys: last value wins +query ? +SELECT json_tuple('{"a":1,"a":2}'::STRING, 'a'::STRING); +---- +{c0: 2} + +# Array field: returned as raw string +query ? +SELECT json_tuple('{"a":[1,2,3]}'::STRING, 'a'::STRING); +---- +{c0: [1,2,3]} + # Verify return type with arrow_typeof query T SELECT arrow_typeof(json_tuple('{"a":1}'::STRING, 'a'::STRING));