-
Notifications
You must be signed in to change notification settings - Fork 2.1k
fix(spark): preserve raw number text in json_tuple to match Spark
#21264
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
59a3ff3
618de15
0e45716
d24932f
5c9a63d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -15,6 +15,7 @@ | |||||
| // specific language governing permissions and limitations | ||||||
| // under the License. | ||||||
|
|
||||||
| use std::collections::HashMap; | ||||||
| use std::sync::Arc; | ||||||
|
|
||||||
| use arrow::array::{Array, ArrayRef, NullBufferBuilder, StringBuilder, StructArray}; | ||||||
|
|
@@ -25,6 +26,7 @@ use datafusion_expr::{ | |||||
| ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature, | ||||||
| Volatility, | ||||||
| }; | ||||||
| use serde_json::value::RawValue; | ||||||
|
|
||||||
| /// Spark-compatible `json_tuple` expression | ||||||
| /// | ||||||
|
|
@@ -134,8 +136,9 @@ fn json_tuple_inner(args: &[ArrayRef], return_type: &DataType) -> Result<ArrayRe | |||||
| } | ||||||
|
|
||||||
| let json_str = json_array.value(row_idx); | ||||||
| match serde_json::from_str::<serde_json::Value>(json_str) { | ||||||
| Ok(serde_json::Value::Object(map)) => { | ||||||
| // Parse into RawValue to preserve original text for numbers | ||||||
| match serde_json::from_str::<HashMap<String, Box<RawValue>>>(json_str) { | ||||||
| Ok(map) => { | ||||||
| null_buffer.append_non_null(); | ||||||
| for (field_idx, builder) in builders.iter_mut().enumerate() { | ||||||
| if field_arrays[field_idx].is_null(row_idx) { | ||||||
|
|
@@ -144,14 +147,32 @@ fn json_tuple_inner(args: &[ArrayRef], return_type: &DataType) -> Result<ArrayRe | |||||
| } | ||||||
| let field_name = field_arrays[field_idx].value(row_idx); | ||||||
| match map.get(field_name) { | ||||||
| Some(serde_json::Value::Null) => { | ||||||
| builder.append_null(); | ||||||
| } | ||||||
| Some(serde_json::Value::String(s)) => { | ||||||
| builder.append_value(s); | ||||||
| } | ||||||
| Some(other) => { | ||||||
| builder.append_value(other.to_string()); | ||||||
| Some(raw) => { | ||||||
| let raw_str = raw.get(); | ||||||
| if raw_str == "null" { | ||||||
| builder.append_null(); | ||||||
| } else if raw_str.starts_with('"') { | ||||||
| // String value: parse to unescape | ||||||
| match serde_json::from_str::<String>(raw_str) { | ||||||
| Ok(s) => builder.append_value(s), | ||||||
| Err(_) => builder.append_value(raw_str), | ||||||
| } | ||||||
| } else { | ||||||
| // Numbers, booleans, objects, arrays: raw text | ||||||
| // Spark uppercases exponent in numeric literals: | ||||||
| // 1.5e10 → 1.5E10 | ||||||
| // Only apply to numbers (not booleans like "false") | ||||||
|
Comment on lines
+160
to
+164
|
||||||
| let first = raw_str.as_bytes().first(); | ||||||
|
||||||
| let first = raw_str.as_bytes().first(); | |
| let first = raw_str.as_bytes().first().copied(); |
Copilot
AI
Mar 30, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
json_tuple currently returns the raw text for -0 (i.e. "-0"), but the test comment notes Spark returns "0". If the goal is Spark compatibility, consider normalizing negative zero to "0" (and then assert exactly "0" in the unit test) so behavior is deterministic and matches Spark.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -147,6 +147,88 @@ SELECT json_tuple('{"имя":"Иван","город":"Москва"}'::STRING, ' | |
| ---- | ||
| {c0: Иван, c1: Москва} | ||
|
|
||
| # ── Additional edge cases ──────────────────────────────────── | ||
|
|
||
| # Trailing comma in JSON is invalid → NULL | ||
| query ? | ||
| SELECT json_tuple('{"a":1,}'::STRING, 'a'::STRING); | ||
| ---- | ||
| NULL | ||
|
|
||
| # Empty string as key | ||
| query ? | ||
| SELECT json_tuple('{"":1}'::STRING, ''::STRING); | ||
| ---- | ||
| {c0: 1} | ||
|
|
||
| # "null" as key name (not JSON null, literal key) | ||
| query ? | ||
| SELECT json_tuple('{"null":1}'::STRING, 'null'::STRING); | ||
| ---- | ||
| {c0: 1} | ||
|
|
||
| # Interleaved existing and missing fields | ||
| query ? | ||
| SELECT json_tuple('{"a":1,"c":3}'::STRING, 'a'::STRING, 'b'::STRING, 'c'::STRING, 'd'::STRING); | ||
| ---- | ||
| {c0: 1, c1: NULL, c2: 3, c3: NULL} | ||
|
|
||
| # ── Number precision (raw text preservation) ───────────────── | ||
|
|
||
| # Scientific notation: Spark returns uppercase exponent | ||
| query ? | ||
| SELECT json_tuple('{"v":1.5e10}'::STRING, 'v'::STRING); | ||
| ---- | ||
| {c0: 1.5E10} | ||
|
|
||
| # Large integer: preserved without float conversion | ||
| query ? | ||
| SELECT json_tuple('{"v":99999999999999999999}'::STRING, 'v'::STRING); | ||
| ---- | ||
| {c0: 99999999999999999999} | ||
|
|
||
| # Normal integer | ||
| query ? | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. mey be we need also tests for I checked quickly in existing tests, seems those areas still uncovered |
||
| SELECT json_tuple('{"v":42}'::STRING, 'v'::STRING); | ||
| ---- | ||
| {c0: 42} | ||
|
|
||
| # Normal float | ||
| query ? | ||
| SELECT json_tuple('{"v":3.14}'::STRING, 'v'::STRING); | ||
| ---- | ||
| {c0: 3.14} | ||
|
|
||
| # Missing key returns NULL | ||
| query ? | ||
| SELECT json_tuple('{"a":1}'::STRING, 'b'::STRING); | ||
| ---- | ||
| {c0: NULL} | ||
|
|
||
| # Dot notation is not supported (treated as literal key) | ||
| query ? | ||
| SELECT json_tuple('{"a":{"b":1}}'::STRING, 'a.b'::STRING); | ||
| ---- | ||
| {c0: NULL} | ||
|
|
||
| # Deep nesting: nested object returned as raw string | ||
| query ? | ||
| SELECT json_tuple('{"a":{"b":{"c":{"d":1}}}}'::STRING, 'a'::STRING); | ||
| ---- | ||
| {c0: {"b":{"c":{"d":1}}}} | ||
|
|
||
| # Duplicate keys: last value wins | ||
| query ? | ||
| SELECT json_tuple('{"a":1,"a":2}'::STRING, 'a'::STRING); | ||
| ---- | ||
| {c0: 2} | ||
|
|
||
| # Array field: returned as raw string | ||
| query ? | ||
| SELECT json_tuple('{"a":[1,2,3]}'::STRING, 'a'::STRING); | ||
| ---- | ||
| {c0: [1,2,3]} | ||
|
|
||
| # Verify return type with arrow_typeof | ||
| query T | ||
| SELECT arrow_typeof(json_tuple('{"a":1}'::STRING, 'a'::STRING)); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
what if NULL?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
With RawValue, values are not parsed — they stay as raw JSON text. So the JSON literal null arrives as the string "null" (not a Rust None). We compare against the raw text to detect it and emit a SQL NULL. This is the same behavior as the original serde_json::Value::Null branch, just expressed differently since we're working with raw text now.