diff --git a/datafusion/spark/Cargo.toml b/datafusion/spark/Cargo.toml index 162b6d814e804..a74196967e2cb 100644 --- a/datafusion/spark/Cargo.toml +++ b/datafusion/spark/Cargo.toml @@ -59,7 +59,7 @@ datafusion-functions-nested = { workspace = true } log = { workspace = true } percent-encoding = "2.3.2" rand = { workspace = true } -serde_json = { workspace = true } +serde_json = { workspace = true, features = ["raw_value"] } sha1 = "0.10" sha2 = { workspace = true } url = { workspace = true } diff --git a/datafusion/spark/src/function/json/json_tuple.rs b/datafusion/spark/src/function/json/json_tuple.rs index 3496f979ffe06..7ff831d1515d2 100644 --- a/datafusion/spark/src/function/json/json_tuple.rs +++ b/datafusion/spark/src/function/json/json_tuple.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use std::collections::HashMap; use std::sync::Arc; use arrow::array::{Array, ArrayRef, NullBufferBuilder, StringBuilder, StructArray}; @@ -25,6 +26,7 @@ use datafusion_expr::{ ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, }; +use serde_json::value::RawValue; /// Spark-compatible `json_tuple` expression /// @@ -134,8 +136,9 @@ fn json_tuple_inner(args: &[ArrayRef], return_type: &DataType) -> Result(json_str) { - Ok(serde_json::Value::Object(map)) => { + // Parse into RawValue to preserve original text for numbers + match serde_json::from_str::>>(json_str) { + Ok(map) => { null_buffer.append_non_null(); for (field_idx, builder) in builders.iter_mut().enumerate() { if field_arrays[field_idx].is_null(row_idx) { @@ -144,14 +147,32 @@ fn json_tuple_inner(args: &[ArrayRef], return_type: &DataType) -> Result { - builder.append_null(); - } - Some(serde_json::Value::String(s)) => { - builder.append_value(s); - } - Some(other) => { - builder.append_value(other.to_string()); + Some(raw) => { + let raw_str = raw.get(); + if raw_str == "null" { + builder.append_null(); + } else if raw_str.starts_with('"') { + // String value: parse to unescape + match serde_json::from_str::(raw_str) { + Ok(s) => builder.append_value(s), + Err(_) => builder.append_value(raw_str), + } + } else { + // Numbers, booleans, objects, arrays: raw text + // Spark uppercases exponent in numeric literals: + // 1.5e10 → 1.5E10 + // Only apply to numbers (not booleans like "false") + let first = raw_str.as_bytes().first(); + let is_number = matches!(first, Some(b'0'..=b'9' | b'-')); + if is_number && raw_str.contains('e') { + builder.append_value(raw_str.replace('e', "E")); + } else if is_number && raw_str == "-0" { + // Spark normalizes -0 to 0 + builder.append_value("0"); + } else { + builder.append_value(raw_str); + } + } } None => { builder.append_null(); @@ -191,6 +212,7 @@ fn json_tuple_inner(args: &[ArrayRef], return_type: &DataType) -> Result Option { + let json_arr: ArrayRef = Arc::new(StringArray::from(vec![json])); + let field_arr: ArrayRef = Arc::new(StringArray::from(vec![field])); + + let return_type = + DataType::Struct(vec![Field::new("c0", DataType::Utf8, true)].into()); + + let result = json_tuple_inner(&[json_arr, field_arr], &return_type).unwrap(); + let struct_arr = result.as_any().downcast_ref::().unwrap(); + let col = struct_arr.column(0); + let str_arr = col.as_any().downcast_ref::().unwrap(); + + if str_arr.is_null(0) { + None + } else { + Some(str_arr.value(0).to_string()) + } + } + + #[test] + fn test_number_scientific_notation() { + // Spark: json_tuple('{"v":1.5e10}', 'v') → '1.5E10' + assert_eq!( + json_tuple_single(r#"{"v":1.5e10}"#, "v"), + Some("1.5E10".to_string()) + ); + } + + #[test] + fn test_number_large_integer() { + // Spark: json_tuple('{"v":99999999999999999999}', 'v') → '99999999999999999999' + assert_eq!( + json_tuple_single(r#"{"v":99999999999999999999}"#, "v"), + Some("99999999999999999999".to_string()) + ); + } + + #[test] + fn test_number_negative_zero() { + // Spark: json_tuple('{"v":-0}', 'v') → '0' + assert_eq!(json_tuple_single(r#"{"v":-0}"#, "v"), Some("0".to_string())); + } + + #[test] + fn test_number_normal_int() { + assert_eq!( + json_tuple_single(r#"{"v":42}"#, "v"), + Some("42".to_string()) + ); + } + + #[test] + fn test_number_normal_float() { + assert_eq!( + json_tuple_single(r#"{"v":3.14}"#, "v"), + Some("3.14".to_string()) + ); + } + #[test] fn test_too_few_args() { let func = JsonTuple::new(); diff --git a/datafusion/sqllogictest/test_files/spark/json/json_tuple.slt b/datafusion/sqllogictest/test_files/spark/json/json_tuple.slt index c0c424946709f..a454f14d5fb65 100644 --- a/datafusion/sqllogictest/test_files/spark/json/json_tuple.slt +++ b/datafusion/sqllogictest/test_files/spark/json/json_tuple.slt @@ -147,6 +147,88 @@ SELECT json_tuple('{"имя":"Иван","город":"Москва"}'::STRING, ' ---- {c0: Иван, c1: Москва} +# ── Additional edge cases ──────────────────────────────────── + +# Trailing comma in JSON is invalid → NULL +query ? +SELECT json_tuple('{"a":1,}'::STRING, 'a'::STRING); +---- +NULL + +# Empty string as key +query ? +SELECT json_tuple('{"":1}'::STRING, ''::STRING); +---- +{c0: 1} + +# "null" as key name (not JSON null, literal key) +query ? +SELECT json_tuple('{"null":1}'::STRING, 'null'::STRING); +---- +{c0: 1} + +# Interleaved existing and missing fields +query ? +SELECT json_tuple('{"a":1,"c":3}'::STRING, 'a'::STRING, 'b'::STRING, 'c'::STRING, 'd'::STRING); +---- +{c0: 1, c1: NULL, c2: 3, c3: NULL} + +# ── Number precision (raw text preservation) ───────────────── + +# Scientific notation: Spark returns uppercase exponent +query ? +SELECT json_tuple('{"v":1.5e10}'::STRING, 'v'::STRING); +---- +{c0: 1.5E10} + +# Large integer: preserved without float conversion +query ? +SELECT json_tuple('{"v":99999999999999999999}'::STRING, 'v'::STRING); +---- +{c0: 99999999999999999999} + +# Normal integer +query ? +SELECT json_tuple('{"v":42}'::STRING, 'v'::STRING); +---- +{c0: 42} + +# Normal float +query ? +SELECT json_tuple('{"v":3.14}'::STRING, 'v'::STRING); +---- +{c0: 3.14} + +# Missing key returns NULL +query ? +SELECT json_tuple('{"a":1}'::STRING, 'b'::STRING); +---- +{c0: NULL} + +# Dot notation is not supported (treated as literal key) +query ? +SELECT json_tuple('{"a":{"b":1}}'::STRING, 'a.b'::STRING); +---- +{c0: NULL} + +# Deep nesting: nested object returned as raw string +query ? +SELECT json_tuple('{"a":{"b":{"c":{"d":1}}}}'::STRING, 'a'::STRING); +---- +{c0: {"b":{"c":{"d":1}}}} + +# Duplicate keys: last value wins +query ? +SELECT json_tuple('{"a":1,"a":2}'::STRING, 'a'::STRING); +---- +{c0: 2} + +# Array field: returned as raw string +query ? +SELECT json_tuple('{"a":[1,2,3]}'::STRING, 'a'::STRING); +---- +{c0: [1,2,3]} + # Verify return type with arrow_typeof query T SELECT arrow_typeof(json_tuple('{"a":1}'::STRING, 'a'::STRING));