Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion datafusion/spark/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ datafusion-functions-nested = { workspace = true }
log = { workspace = true }
percent-encoding = "2.3.2"
rand = { workspace = true }
serde_json = { workspace = true }
serde_json = { workspace = true, features = ["raw_value"] }
sha1 = "0.10"
sha2 = { workspace = true }
url = { workspace = true }
Expand Down
102 changes: 92 additions & 10 deletions datafusion/spark/src/function/json/json_tuple.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
// specific language governing permissions and limitations
// under the License.

use std::collections::HashMap;
use std::sync::Arc;

use arrow::array::{Array, ArrayRef, NullBufferBuilder, StringBuilder, StructArray};
Expand All @@ -25,6 +26,7 @@ use datafusion_expr::{
ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature,
Volatility,
};
use serde_json::value::RawValue;

/// Spark-compatible `json_tuple` expression
///
Expand Down Expand Up @@ -134,8 +136,9 @@ fn json_tuple_inner(args: &[ArrayRef], return_type: &DataType) -> Result<ArrayRe
}

let json_str = json_array.value(row_idx);
match serde_json::from_str::<serde_json::Value>(json_str) {
Ok(serde_json::Value::Object(map)) => {
// Parse into RawValue to preserve original text for numbers
match serde_json::from_str::<HashMap<String, Box<RawValue>>>(json_str) {
Ok(map) => {
null_buffer.append_non_null();
for (field_idx, builder) in builders.iter_mut().enumerate() {
if field_arrays[field_idx].is_null(row_idx) {
Expand All @@ -144,14 +147,32 @@ fn json_tuple_inner(args: &[ArrayRef], return_type: &DataType) -> Result<ArrayRe
}
let field_name = field_arrays[field_idx].value(row_idx);
match map.get(field_name) {
Some(serde_json::Value::Null) => {
builder.append_null();
}
Some(serde_json::Value::String(s)) => {
builder.append_value(s);
}
Some(other) => {
builder.append_value(other.to_string());
Some(raw) => {
let raw_str = raw.get();
if raw_str == "null" {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what if NULL?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With RawValue, values are not parsed — they stay as raw JSON text. So the JSON literal null arrives as the string "null" (not a Rust None). We compare against the raw text to detect it and emit a SQL NULL. This is the same behavior as the original serde_json::Value::Null branch, just expressed differently since we're working with raw text now.

builder.append_null();
} else if raw_str.starts_with('"') {
// String value: parse to unescape
match serde_json::from_str::<String>(raw_str) {
Ok(s) => builder.append_value(s),
Err(_) => builder.append_value(raw_str),
}
} else {
// Numbers, booleans, objects, arrays: raw text
// Spark uppercases exponent in numeric literals:
// 1.5e10 → 1.5E10
// Only apply to numbers (not booleans like "false")
Comment on lines +160 to +164
Copy link

Copilot AI Mar 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This implementation now returns the raw JSON token text not only for numbers, but also for booleans/objects/arrays (builder.append_value(raw_str)), whereas the previous serde_json::Value + to_string() path would have produced a canonicalized serialization (e.g., potentially different whitespace/key ordering). If the intended user-facing change is only number formatting/precision, consider keeping to_string() for non-number values or updating the PR description/docs to reflect the broader behavior change.

Copilot uses AI. Check for mistakes.
let first = raw_str.as_bytes().first();
Copy link

Copilot AI Mar 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

raw_str.as_bytes().first() returns Option<&u8>, but matches!(first, Some(b'0'..=b'9' | b'-')) is matching as if it were Option<u8>. This should fail to compile due to the &u8 vs u8 mismatch. Consider using .copied() (or otherwise dereferencing) before matching so the pattern works on Option<u8>.

Suggested change
let first = raw_str.as_bytes().first();
let first = raw_str.as_bytes().first().copied();

Copilot uses AI. Check for mistakes.
let is_number = matches!(first, Some(b'0'..=b'9' | b'-'));
if is_number && raw_str.contains('e') {
builder.append_value(raw_str.replace('e', "E"));
} else if is_number && raw_str == "-0" {
// Spark normalizes -0 to 0
builder.append_value("0");
} else {
builder.append_value(raw_str);
}
Comment on lines +160 to +174
Copy link

Copilot AI Mar 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

json_tuple currently returns the raw text for -0 (i.e. "-0"), but the test comment notes Spark returns "0". If the goal is Spark compatibility, consider normalizing negative zero to "0" (and then assert exactly "0" in the unit test) so behavior is deterministic and matches Spark.

Copilot uses AI. Check for mistakes.
}
}
None => {
builder.append_null();
Expand Down Expand Up @@ -191,6 +212,7 @@ fn json_tuple_inner(args: &[ArrayRef], return_type: &DataType) -> Result<ArrayRe
#[cfg(test)]
mod tests {
use super::*;
use arrow::array::StringArray;

#[test]
fn test_return_field_shape() {
Expand Down Expand Up @@ -219,6 +241,66 @@ mod tests {
}
}

/// Helper to run json_tuple with a single field and return the result string.
/// Runs `json_tuple_inner` over a single-row input with one requested field,
/// returning the extracted value, or `None` when the result is SQL NULL.
fn json_tuple_single(json: &str, field: &str) -> Option<String> {
    let json_col: ArrayRef = Arc::new(StringArray::from(vec![json]));
    let field_col: ArrayRef = Arc::new(StringArray::from(vec![field]));

    // json_tuple always yields a struct with one Utf8 column per field.
    let out_type = DataType::Struct(vec![Field::new("c0", DataType::Utf8, true)].into());

    let out = json_tuple_inner(&[json_col, field_col], &out_type).unwrap();
    let rows = out.as_any().downcast_ref::<StructArray>().unwrap();
    let values = rows
        .column(0)
        .as_any()
        .downcast_ref::<StringArray>()
        .unwrap();

    // Map the single row: NULL slot -> None, otherwise the owned string.
    (!values.is_null(0)).then(|| values.value(0).to_string())
}

#[test]
fn test_number_scientific_notation() {
    // Spark uppercases the exponent marker in numeric literals:
    // json_tuple('{"v":1.5e10}', 'v') → '1.5E10'
    let got = json_tuple_single(r#"{"v":1.5e10}"#, "v");
    assert_eq!(got.as_deref(), Some("1.5E10"));
}

#[test]
fn test_number_large_integer() {
    // Integers wider than i64/f64 must round-trip as raw text, never via
    // a lossy float conversion (Spark returns the literal unchanged).
    let got = json_tuple_single(r#"{"v":99999999999999999999}"#, "v");
    assert_eq!(got.as_deref(), Some("99999999999999999999"));
}

#[test]
fn test_number_negative_zero() {
    // Spark normalizes a JSON -0 literal to plain '0'.
    let got = json_tuple_single(r#"{"v":-0}"#, "v");
    assert_eq!(got.as_deref(), Some("0"));
}

#[test]
fn test_number_normal_int() {
    // Ordinary integers are returned verbatim.
    let got = json_tuple_single(r#"{"v":42}"#, "v");
    assert_eq!(got.as_deref(), Some("42"));
}

#[test]
fn test_number_normal_float() {
    // Ordinary decimal numbers are returned verbatim.
    let got = json_tuple_single(r#"{"v":3.14}"#, "v");
    assert_eq!(got.as_deref(), Some("3.14"));
}

#[test]
fn test_too_few_args() {
let func = JsonTuple::new();
Expand Down
82 changes: 82 additions & 0 deletions datafusion/sqllogictest/test_files/spark/json/json_tuple.slt
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,88 @@ SELECT json_tuple('{"имя":"Иван","город":"Москва"}'::STRING, '
----
{c0: Иван, c1: Москва}

# ── Additional edge cases ────────────────────────────────────

# Trailing comma in JSON is invalid → NULL
query ?
SELECT json_tuple('{"a":1,}'::STRING, 'a'::STRING);
----
NULL

# Empty string as key
query ?
SELECT json_tuple('{"":1}'::STRING, ''::STRING);
----
{c0: 1}

# "null" as key name (not JSON null, literal key)
query ?
SELECT json_tuple('{"null":1}'::STRING, 'null'::STRING);
----
{c0: 1}

# Interleaved existing and missing fields
query ?
SELECT json_tuple('{"a":1,"c":3}'::STRING, 'a'::STRING, 'b'::STRING, 'c'::STRING, 'd'::STRING);
----
{c0: 1, c1: NULL, c2: 3, c3: NULL}

# ── Number precision (raw text preservation) ─────────────────

# Scientific notation: Spark returns uppercase exponent
query ?
SELECT json_tuple('{"v":1.5e10}'::STRING, 'v'::STRING);
----
{c0: 1.5E10}

# Large integer: preserved without float conversion
query ?
SELECT json_tuple('{"v":99999999999999999999}'::STRING, 'v'::STRING);
----
{c0: 99999999999999999999}

# Normal integer
query ?
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe we also need tests for

-- Missing key
SELECT json_tuple('{"a":1}', 'b'); 
-- Tests behavior when requested key does not exist

-- Dot notation attempt
SELECT json_tuple('{"a":{"b":1}}', 'a.b'); 
-- Confirms dot notation is not supported

-- Deep nesting
SELECT json_tuple('{"a":{"b":{"c":{"d":1}}}}', 'a'); 
-- Tests extraction of nested object as string

-- Duplicate keys
SELECT json_tuple('{"a":1,"a":2}', 'a'); 
-- Tests duplicate key resolution (last wins)

-- Array field
SELECT json_tuple('{"a":[1,2,3]}', 'a'); 

I checked quickly in existing tests, seems those areas still uncovered

SELECT json_tuple('{"v":42}'::STRING, 'v'::STRING);
----
{c0: 42}

# Normal float
query ?
SELECT json_tuple('{"v":3.14}'::STRING, 'v'::STRING);
----
{c0: 3.14}

# Missing key returns NULL
query ?
SELECT json_tuple('{"a":1}'::STRING, 'b'::STRING);
----
{c0: NULL}

# Dot notation is not supported (treated as literal key)
query ?
SELECT json_tuple('{"a":{"b":1}}'::STRING, 'a.b'::STRING);
----
{c0: NULL}

# Deep nesting: nested object returned as raw string
query ?
SELECT json_tuple('{"a":{"b":{"c":{"d":1}}}}'::STRING, 'a'::STRING);
----
{c0: {"b":{"c":{"d":1}}}}

# Duplicate keys: last value wins
query ?
SELECT json_tuple('{"a":1,"a":2}'::STRING, 'a'::STRING);
----
{c0: 2}

# Array field: returned as raw string
query ?
SELECT json_tuple('{"a":[1,2,3]}'::STRING, 'a'::STRING);
----
{c0: [1,2,3]}

# Verify return type with arrow_typeof
query T
SELECT arrow_typeof(json_tuple('{"a":1}'::STRING, 'a'::STRING));
Expand Down