diff --git a/.prettierignore b/.prettierignore new file mode 100644 index 0000000000..4b81e3fef4 --- /dev/null +++ b/.prettierignore @@ -0,0 +1,5 @@ +# prettier re-aligns markdown table columns to the widest cell, so adding a +# single expression row rewrites every row in the table. That produces noisy +# diffs and frequent merge conflicts between PRs that each add new expressions. +# This file is almost entirely tables, so exempt it from prettier formatting. +docs/source/user-guide/latest/expressions.md diff --git a/dev/generate-release-docs.sh b/dev/generate-release-docs.sh index 51b61db0a3..27b7829429 100755 --- a/dev/generate-release-docs.sh +++ b/dev/generate-release-docs.sh @@ -42,7 +42,8 @@ PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" cd "${PROJECT_ROOT}" echo "Compiling and generating documentation content..." -./mvnw package -Pgenerate-docs -DskipTests -Dmaven.test.skip=true +# Use the newest Spark profile so the expression reference enumerates the full FunctionRegistry. +./mvnw package -Pgenerate-docs -Pspark-4.1 -DskipTests -Dmaven.test.skip=true echo "" echo "Done! Generated documentation content in docs/source/user-guide/latest/" diff --git a/docs/build.sh b/docs/build.sh index d5618bc138..fbbff6cd8a 100755 --- a/docs/build.sh +++ b/docs/build.sh @@ -34,7 +34,8 @@ python3 generate-versions.py # This runs GenerateDocs against the temp copy, not source files echo "Generating dynamic documentation content..." cd .. -./mvnw -q package -Pgenerate-docs -DskipTests -Dmaven.test.skip=true \ +# Use the newest Spark profile so the expression reference enumerates the full FunctionRegistry. 
+./mvnw -q package -Pgenerate-docs -Pspark-4.1 -DskipTests -Dmaven.test.skip=true \ -Dexec.arguments="$(pwd)/docs/temp/user-guide/latest/" cd docs diff --git a/docs/source/user-guide/latest/expressions.md b/docs/source/user-guide/latest/expressions.md index 3627766445..d593861c01 100644 --- a/docs/source/user-guide/latest/expressions.md +++ b/docs/source/user-guide/latest/expressions.md @@ -25,8 +25,6 @@ dispatching to a Spark-compatible codegen path. When an expression is not suppor transparently falls back to Spark for that part of the plan; results are unaffected. Expressions marked ✅ Supported are enabled by default and produce Spark-compatible results. -Expressions marked ⚠️ Incorrect by default run natively by default but can return results that -differ from Spark on some inputs; see the linked detail on each affected row. Some ✅ Supported expressions have specific incompatible cases that fall back to Spark by default. Those cases must be opted into per expression with @@ -39,12 +37,11 @@ Most expressions can also be disabled with `spark.comet.expression.EXPRNAME.enab ## Status legend -| Status | Meaning | -| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| ✅ Supported | Comet produces Spark-compatible results by default. Some inputs or forms may fall back to Spark, and any incompatible behavior is opt-in (off by default). | -| ⚠️ Incorrect by default | Comet runs natively by default but can return results that differ from Spark (a wrong value, or a native error on valid input). See the linked detail on each row. | -| 🔜 Planned | Intended; tracked by an open issue or pull request. | -| 💤 Not currently planned | Not on the current roadmap; falls back to Spark and may be reconsidered later. | +| Status | Meaning | +| --- | --- | +| ✅ Supported | Comet produces Spark-compatible results by default. 
Some inputs or forms may fall back to Spark, and any incompatible behavior is opt-in (off by default). | +| 🔜 Planned | Intended; tracked by an open issue or pull request. | +| 💤 Not currently planned | Not on the current roadmap; falls back to Spark and may be reconsidered later. | ## Not currently planned @@ -67,146 +64,36 @@ The tables below list every Spark built-in expression with its current status. ## agg_funcs -| Function | Status | Notes | -| ----------------------- | ------ | -------------------------------------------------------------------------------------------------------------------------- | -| `any` | ✅ | | -| `any_value` | ✅ | | -| `approx_count_distinct` | 🔜 | tracking [#4098](https://github.com/apache/datafusion-comet/issues/4098) | -| `array_agg` | 🔜 | Array aggregate (related to `collect_list`, [#2524](https://github.com/apache/datafusion-comet/issues/2524)) | -| `avg` | ✅ | Interval types fall back | -| `bit_and` | ✅ | | -| `bit_or` | ✅ | | -| `bit_xor` | ✅ | | -| `bool_and` | ✅ | | -| `bool_or` | ✅ | | -| `collect_list` | 🔜 | [#2524](https://github.com/apache/datafusion-comet/issues/2524) | -| `collect_set` | ✅ | | -| `corr` | ✅ | | -| `count` | ✅ | | -| `count_if` | ✅ | | -| `covar_pop` | ✅ | | -| `covar_samp` | ✅ | | -| `every` | ✅ | | -| `first` | ✅ | | -| `first_value` | ✅ | | -| `grouping` | 🔜 | Grouping indicator for ROLLUP/CUBE/GROUPING SETS | -| `grouping_id` | 🔜 | Grouping indicator for ROLLUP/CUBE/GROUPING SETS | -| `kurtosis` | 🔜 | tracking [#4098](https://github.com/apache/datafusion-comet/issues/4098) | -| `last` | ✅ | | -| `last_value` | ✅ | | -| `listagg` | 🔜 | String aggregation | -| `max` | ✅ | | -| `max_by` | 🔜 | [#3841](https://github.com/apache/datafusion-comet/issues/3841) | -| `mean` | ✅ | | -| `median` | 🔜 | tracking [#4098](https://github.com/apache/datafusion-comet/issues/4098) | -| `min` | ✅ | | -| `min_by` | 🔜 | [#3841](https://github.com/apache/datafusion-comet/issues/3841) | -| `mode` | 🔜 | 
[#3970](https://github.com/apache/datafusion-comet/issues/3970) | -| `percentile` | 🔜 | [#4542](https://github.com/apache/datafusion-comet/issues/4542) | -| `percentile_cont` | 🔜 | Percentile aggregate | -| `percentile_disc` | 🔜 | Percentile aggregate | -| `regr_avgx` | ✅ | Native: Spark rewrites to `Average` (tests in [#4551](https://github.com/apache/datafusion-comet/issues/4551)) | -| `regr_avgy` | ✅ | Native: Spark rewrites to `Average` (tests in [#4551](https://github.com/apache/datafusion-comet/issues/4551)) | -| `regr_count` | ✅ | Native: Spark rewrites to `Count` (tests in [#4551](https://github.com/apache/datafusion-comet/issues/4551)) | -| `regr_intercept` | 🔜 | Falls back; can reuse `covar_pop`/`var_pop` accumulators ([#4552](https://github.com/apache/datafusion-comet/issues/4552)) | -| `regr_r2` | 🔜 | Falls back; can reuse the `corr` accumulator ([#4552](https://github.com/apache/datafusion-comet/issues/4552)) | -| `regr_slope` | 🔜 | Falls back; can reuse `covar_pop`/`var_pop` accumulators ([#4552](https://github.com/apache/datafusion-comet/issues/4552)) | -| `regr_sxx` | 🔜 | Falls back; can reuse `var_pop` accumulator ([#4552](https://github.com/apache/datafusion-comet/issues/4552)) | -| `regr_sxy` | 🔜 | Falls back; can reuse `covar_pop` accumulator ([#4552](https://github.com/apache/datafusion-comet/issues/4552)) | -| `regr_syy` | 🔜 | Falls back; can reuse `var_pop` accumulator ([#4552](https://github.com/apache/datafusion-comet/issues/4552)) | -| `skewness` | 🔜 | tracking [#4098](https://github.com/apache/datafusion-comet/issues/4098) | -| `some` | ✅ | | -| `std` | ✅ | | -| `stddev` | ✅ | | -| `stddev_pop` | ✅ | | -| `stddev_samp` | ✅ | | -| `string_agg` | 🔜 | String aggregation (alias of `listagg`) | -| `sum` | ✅ | | -| `try_avg` | 🔜 | tracking [#4098](https://github.com/apache/datafusion-comet/issues/4098) | -| `try_sum` | 🔜 | tracking [#4098](https://github.com/apache/datafusion-comet/issues/4098) | -| `var_pop` | ✅ | | -| `var_samp` | ✅ | | -| 
`variance` | ✅ | | + + --- ## array_funcs -| Function | Status | Notes | -| ----------------- | ------ | ----------------------------------------------------------------------------------- | -| `array` | ✅ | | -| `array_append` | ✅ | | -| `array_compact` | ✅ | | -| `array_contains` | ✅ | NaN/signed-zero handling may differ ([details](compatibility/floating-point.md)) | -| `array_distinct` | ✅ | NaN/signed-zero handling may differ ([details](compatibility/floating-point.md)) | -| `array_except` | ✅ | Incompatible; falls back by default ([details](compatibility/expressions/array.md)) | -| `array_insert` | ✅ | | -| `array_intersect` | ✅ | Incompatible; falls back by default ([details](compatibility/expressions/array.md)) | -| `array_join` | ✅ | Incompatible; falls back by default ([details](compatibility/expressions/array.md)) | -| `array_max` | ✅ | NaN ordering may differ ([details](compatibility/floating-point.md)) | -| `array_min` | ✅ | NaN ordering may differ ([details](compatibility/floating-point.md)) | -| `array_position` | ✅ | Binary/struct/map/null elements fall back | -| `array_prepend` | 🔜 | Sibling of `array_append` | -| `array_remove` | ✅ | | -| `array_repeat` | ✅ | | -| `array_union` | ✅ | NaN/signed-zero handling may differ ([details](compatibility/floating-point.md)) | -| `arrays_overlap` | ✅ | | -| `arrays_zip` | ✅ | | -| `element_at` | ✅ | MapType input falls back | -| `flatten` | ✅ | Binary/struct/map elements fall back | -| `get` | ✅ | | -| `sequence` | 🔜 | [#4538](https://github.com/apache/datafusion-comet/issues/4538) | -| `shuffle` | 🔜 | Random array shuffle | -| `slice` | ✅ | Native ([#4149](https://github.com/apache/datafusion-comet/issues/4149)) | -| `sort_array` | ✅ | Nested struct/null arrays fall back | + + --- ## bitwise_funcs -| Function | Status | Notes | -| -------------------- | ------ | ---------------------------------------------------- | -| `&` | ✅ | | -| `<<` | ✅ | | -| `>>` | ✅ | | -| `>>>` | ✅ | Operator alias for 
`shiftrightunsigned` (Spark 4.0+) | -| `^` | ✅ | | -| `bit_count` | ✅ | | -| `bit_get` | ✅ | | -| `getbit` | ✅ | | -| `shiftright` | ✅ | | -| `shiftrightunsigned` | ✅ | | -| `\|` | ✅ | | -| `~` | ✅ | | + + --- ## collection_funcs -| Function | Status | Notes | -| ------------- | ------ | ---------------------------------------------------------------------------------------------- | -| `array_size` | ✅ | | -| `cardinality` | ✅ | MapType input falls back | -| `concat` | ✅ | Binary/array children fall back | -| `reverse` | ✅ | Binary-element arrays fall back (Incompatible) ([details](compatibility/expressions/array.md)) | -| `size` | ✅ | MapType input falls back | + + --- ## conditional_funcs -| Function | Status | Notes | -| ------------ | ------ | --------------------------------------------------------------- | -| `coalesce` | ✅ | | -| `if` | ✅ | | -| `ifnull` | ✅ | | -| `nanvl` | 🔜 | [#4538](https://github.com/apache/datafusion-comet/issues/4538) | -| `nullif` | ✅ | | -| `nullifzero` | ✅ | Lowers to `if`/`=` (Spark 4.0+) | -| `nvl` | ✅ | | -| `nvl2` | ✅ | | -| `when` | ✅ | | -| `zeroifnull` | ✅ | Lowers to `coalesce` (Spark 4.0+) | + + --- @@ -214,89 +101,15 @@ The tables below list every Spark built-in expression with its current status. The type-name conversion functions (`bigint`, `binary`, `boolean`, `date`, `decimal`, `double`, `float`, `int`, `smallint`, `string`, `timestamp`, `tinyint`) are SQL aliases for `CAST(... AS )` and share the support and caveats of `cast`. 
-| Function | Status | Notes | -| -------- | ------ | ----------------------------------------------------------------------------------------------- | -| `cast` | ✅ | Some casts fall back; float-to-decimal is opt-in ([details](compatibility/expressions/cast.md)) | + + --- ## datetime_funcs -| Function | Status | Notes | -| --------------------- | ------ | -------------------------------------------------------------------------------------------------------------------------------------- | -| `add_months` | ✅ | | -| `convert_timezone` | ✅ | | -| `curdate` | ✅ | Constant-folded to a literal (alias of `current_date`) | -| `current_date` | ✅ | Constant-folded to a literal before Comet sees the plan | -| `current_time` | 🔜 | Blocked on Spark 4.1 TIME type support ([#4288](https://github.com/apache/datafusion-comet/issues/4288)) | -| `current_timestamp` | ✅ | Constant-folded to a literal before Comet sees the plan | -| `current_timezone` | ✅ | | -| `date_add` | ✅ | | -| `date_diff` | ✅ | | -| `date_format` | ✅ | | -| `date_from_unix_date` | ✅ | | -| `date_part` | ✅ | | -| `date_sub` | ✅ | | -| `date_trunc` | ✅ | | -| `dateadd` | ✅ | | -| `datediff` | ✅ | | -| `datepart` | ✅ | | -| `day` | ✅ | | -| `dayname` | 🔜 | [#4544](https://github.com/apache/datafusion-comet/issues/4544) | -| `dayofmonth` | ✅ | | -| `dayofweek` | ✅ | | -| `dayofyear` | ✅ | | -| `extract` | ✅ | | -| `from_unixtime` | ✅ | | -| `from_utc_timestamp` | ✅ | Legacy zone forms fall back (Incompatible) ([details](compatibility/expressions/datetime.md)) | -| `hour` | ✅ | | -| `last_day` | ✅ | | -| `localtimestamp` | ✅ | | -| `make_date` | ✅ | | -| `make_dt_interval` | 🔜 | [#4541](https://github.com/apache/datafusion-comet/issues/4541) | -| `make_interval` | 🔜 | Produces legacy CalendarInterval; tracked by [#4540](https://github.com/apache/datafusion-comet/issues/4540) | -| `make_time` | 🔜 | Spark 4.1 TIME type; tracked by [#4288](https://github.com/apache/datafusion-comet/issues/4288) | -| `make_timestamp` 
| ✅ | | -| `make_timestamp_ltz` | ✅ | 2-arg TIME form falls back | -| `make_timestamp_ntz` | ✅ | 2-arg TIME form falls back | -| `make_ym_interval` | 🔜 | [#4541](https://github.com/apache/datafusion-comet/issues/4541) | -| `minute` | ✅ | | -| `month` | ✅ | | -| `monthname` | 🔜 | [#4544](https://github.com/apache/datafusion-comet/issues/4544) | -| `months_between` | ✅ | | -| `next_day` | ✅ | | -| `now` | ✅ | Constant-folded to a literal (alias of `current_timestamp`) | -| `quarter` | ✅ | | -| `second` | ✅ | | -| `session_window` | 🔜 | Time-window grouping; tracked by [#4553](https://github.com/apache/datafusion-comet/issues/4553) | -| `time_diff` | 🔜 | Spark 4.1 TIME type; tracked by [#4288](https://github.com/apache/datafusion-comet/issues/4288) | -| `time_trunc` | 🔜 | Spark 4.1 TIME type; tracked by [#4288](https://github.com/apache/datafusion-comet/issues/4288) | -| `timestamp_micros` | ✅ | | -| `timestamp_millis` | ✅ | | -| `timestamp_seconds` | ✅ | | -| `to_date` | ✅ | Rewrites to `Cast` (or `Cast(GetTimestamp)` with a format) before Comet sees the plan | -| `to_time` | 🔜 | Spark 4.1 TIME type; tracked by [#4288](https://github.com/apache/datafusion-comet/issues/4288) | -| `to_timestamp` | ✅ | Rewrites to `Cast` (or `GetTimestamp` with a format) before Comet sees the plan | -| `to_timestamp_ltz` | ✅ | Rewrites to `to_timestamp` (`TimestampType`) | -| `to_timestamp_ntz` | ✅ | Rewrites to `to_timestamp` (`TimestampNTZType`) | -| `to_unix_timestamp` | ✅ | | -| `to_utc_timestamp` | ✅ | Legacy zone forms fall back (Incompatible) ([details](compatibility/expressions/datetime.md)) | -| `trunc` | ✅ | | -| `try_make_interval` | 🔜 | Produces legacy CalendarInterval; tracked by [#4540](https://github.com/apache/datafusion-comet/issues/4540) | -| `try_make_timestamp` | ⚠️ | Returns a wrong value instead of NULL for invalid inputs ([#4554](https://github.com/apache/datafusion-comet/issues/4554)) | -| `try_to_date` | 🔜 | Rewrites to `Cast`/`GetTimestamp` but currently falls 
back; tracked by [#4556](https://github.com/apache/datafusion-comet/issues/4556) | -| `try_to_time` | 🔜 | Spark 4.1 TIME type; tracked by [#4288](https://github.com/apache/datafusion-comet/issues/4288) | -| `try_to_timestamp` | 🔜 | Rewrites to `Cast`/`GetTimestamp` but currently falls back; tracked by [#4556](https://github.com/apache/datafusion-comet/issues/4556) | -| `unix_date` | ✅ | | -| `unix_micros` | ✅ | | -| `unix_millis` | ✅ | | -| `unix_seconds` | ✅ | | -| `unix_timestamp` | ✅ | | -| `weekday` | ✅ | | -| `weekofyear` | ✅ | | -| `window` | 🔜 | Time-window grouping; tracked by [#4553](https://github.com/apache/datafusion-comet/issues/4553) | -| `window_time` | 🔜 | Time-window grouping; tracked by [#4553](https://github.com/apache/datafusion-comet/issues/4553) | -| `year` | ✅ | | + + --- @@ -306,43 +119,22 @@ The type-name conversion functions (`bigint`, `binary`, `boolean`, `date`, `deci expression-level). The `outer` variants are wired but marked `Incompatible`; they require `spark.comet.exec.explode.enabled=true` and `allowIncompatible`. 
-| Function | Status | Notes | -| ------------------ | ------ | ----------------------------------------------------------------------------------------------------------------------------- | -| `explode` | ✅ | via `CometExplodeExec` | -| `explode_outer` | ✅ | outer=true falls back (Incompatible) ([audit](../../contributor-guide/expression-audits/generator_funcs.md#explode_outer)) | -| `inline` | 🔜 | Operator-level generator (like `explode`) | -| `inline_outer` | 🔜 | Operator-level generator (like `explode`) | -| `posexplode` | ✅ | via `CometExplodeExec` | -| `posexplode_outer` | ✅ | outer=true falls back (Incompatible) ([audit](../../contributor-guide/expression-audits/generator_funcs.md#posexplode_outer)) | -| `stack` | 🔜 | Operator-level generator | + + --- ## hash_funcs -| Function | Status | Notes | -| ---------- | ------ | ----- | -| `crc32` | ✅ | | -| `hash` | ✅ | | -| `md5` | ✅ | | -| `sha` | ✅ | | -| `sha1` | ✅ | | -| `sha2` | ✅ | | -| `xxhash64` | ✅ | | + + --- ## json_funcs -| Function | Status | Notes | -| ------------------- | ------ | -------------------------------------------------------------------------------------------------------------------------------- | -| `from_json` | ✅ | Falls back by default; opt-in via allowIncompatible ([audit](../../contributor-guide/expression-audits/json_funcs.md#from_json)) | -| `get_json_object` | ✅ | Some inputs need allowIncompatible ([audit](../../contributor-guide/expression-audits/json_funcs.md#get_json_object)) | -| `json_array_length` | 🔜 | tracking [#4098](https://github.com/apache/datafusion-comet/issues/4098) | -| `json_object_keys` | 🔜 | [#3161](https://github.com/apache/datafusion-comet/issues/3161) | -| `json_tuple` | 🔜 | [#3160](https://github.com/apache/datafusion-comet/issues/3160) | -| `schema_of_json` | 🔜 | [#3163](https://github.com/apache/datafusion-comet/issues/3163) | -| `to_json` | ✅ | Options and map/array inputs fall back 
([audit](../../contributor-guide/expression-audits/json_funcs.md#to_json)) | + + --- @@ -350,269 +142,64 @@ expression-level). The `outer` variants are wired but marked `Incompatible`; the All higher-order functions are planned via [#4224](https://github.com/apache/datafusion-comet/issues/4224). -| Function | Status | Notes | -| ------------------ | ------ | --------------------------------------------------------------- | -| `aggregate` | 🔜 | [#4224](https://github.com/apache/datafusion-comet/issues/4224) | -| `array_sort` | 🔜 | [#4224](https://github.com/apache/datafusion-comet/issues/4224) | -| `exists` | 🔜 | [#4224](https://github.com/apache/datafusion-comet/issues/4224) | -| `filter` | 🔜 | [#4224](https://github.com/apache/datafusion-comet/issues/4224) | -| `forall` | 🔜 | [#4224](https://github.com/apache/datafusion-comet/issues/4224) | -| `map_filter` | 🔜 | [#4224](https://github.com/apache/datafusion-comet/issues/4224) | -| `map_zip_with` | 🔜 | [#4224](https://github.com/apache/datafusion-comet/issues/4224) | -| `reduce` | 🔜 | [#4224](https://github.com/apache/datafusion-comet/issues/4224) | -| `transform` | 🔜 | [#4224](https://github.com/apache/datafusion-comet/issues/4224) | -| `transform_keys` | 🔜 | [#4224](https://github.com/apache/datafusion-comet/issues/4224) | -| `transform_values` | 🔜 | [#4224](https://github.com/apache/datafusion-comet/issues/4224) | -| `zip_with` | 🔜 | [#4224](https://github.com/apache/datafusion-comet/issues/4224) | + + --- ## map_funcs -| Function | Status | Notes | -| ------------------ | ------ | -------------------------------------------------------------------------------------------- | -| `element_at` | ✅ | MapType input falls back | -| `map` | 🔜 | Constructs a map | -| `map_concat` | 🔜 | Concatenates maps | -| `map_contains_key` | ✅ | | -| `map_entries` | ✅ | | -| `map_from_arrays` | ✅ | | -| `map_from_entries` | ✅ | BinaryType key/value falls back (Incompatible) ([details](compatibility/expressions/map.md)) | -| 
`map_keys` | ✅ | | -| `map_values` | ✅ | | -| `str_to_map` | ✅ | | -| `try_element_at` | ✅ | Lowers to `element_at`; array input (MapType falls back) | + + --- ## math_funcs -| Function | Status | Notes | -| -------------- | ------ | ---------------------------------------------------------------------------------------------------------------------------- | -| `%` | ✅ | try_mod (TRY mode) falls back | -| `*` | ✅ | Interval multiplication falls back | -| `+` | ✅ | | -| `-` | ✅ | | -| `/` | ✅ | | -| `abs` | ✅ | Interval types fall back | -| `acos` | ✅ | | -| `acosh` | ✅ | | -| `asin` | ✅ | | -| `asinh` | ✅ | | -| `atan` | ✅ | | -| `atan2` | ✅ | | -| `atanh` | ✅ | | -| `bin` | ✅ | | -| `bround` | 🔜 | [#4538](https://github.com/apache/datafusion-comet/issues/4538) | -| `cbrt` | ✅ | | -| `ceil` | ✅ | Two-arg form falls back | -| `ceiling` | ✅ | | -| `conv` | 🔜 | [#4538](https://github.com/apache/datafusion-comet/issues/4538) | -| `cos` | ✅ | | -| `cosh` | ✅ | | -| `cot` | ✅ | | -| `csc` | ✅ | | -| `degrees` | ✅ | | -| `div` | ✅ | | -| `e` | ✅ | Folds to a literal (like `pi`) | -| `exp` | ✅ | | -| `expm1` | ✅ | | -| `factorial` | ✅ | | -| `floor` | ✅ | Two-arg form falls back | -| `greatest` | ✅ | | -| `hex` | ✅ | | -| `hypot` | 🔜 | [#4538](https://github.com/apache/datafusion-comet/issues/4538) | -| `least` | ✅ | | -| `ln` | ✅ | | -| `log` | ✅ | | -| `log10` | ✅ | | -| `log1p` | 🔜 | [#4538](https://github.com/apache/datafusion-comet/issues/4538) | -| `log2` | ✅ | | -| `mod` | ✅ | | -| `negative` | ✅ | | -| `pi` | ✅ | | -| `pmod` | 🔜 | [#4538](https://github.com/apache/datafusion-comet/issues/4538) | -| `positive` | ✅ | | -| `pow` | ✅ | | -| `power` | ✅ | | -| `radians` | ✅ | | -| `rand` | ✅ | | -| `randn` | ✅ | | -| `random` | ✅ | Alias for `rand` (Spark 4.0+); seed must be a literal | -| `randstr` | 🔜 | Random string (Spark 4.0+) | -| `rint` | ✅ | | -| `round` | ✅ | Float/double inputs fall back | -| `sec` | ✅ | | -| `shiftleft` | ✅ | | -| `sign` | ✅ | | -| `signum` | 
✅ | | -| `sin` | ✅ | | -| `sinh` | ✅ | | -| `sqrt` | ✅ | | -| `tan` | ✅ | | -| `tanh` | ✅ | | -| `try_add` | ✅ | Datetime/interval form falls back | -| `try_divide` | ✅ | | -| `try_mod` | 🔜 | Lowers to `Remainder` with TRY eval mode, which falls back ([#4484](https://github.com/apache/datafusion-comet/issues/4484)) | -| `try_multiply` | ✅ | | -| `try_subtract` | ✅ | | -| `unhex` | ✅ | | -| `uniform` | ✅ | Constant-folded; literal arguments only (Spark 4.0+) | -| `width_bucket` | ✅ | | + + --- ## misc_funcs -| Function | Status | Notes | -| ----------------------------- | ------ | ------------------------------------------------------------------------------------------------------------------------------------------ | -| `aes_decrypt` | 🔜 | Falls back; `StaticInvoke` not allowlisted; planned via codegen dispatch ([#4558](https://github.com/apache/datafusion-comet/issues/4558)) | -| `aes_encrypt` | 🔜 | Falls back; planned via codegen dispatch ([#4558](https://github.com/apache/datafusion-comet/issues/4558)); nondeterministic IV by default | -| `assert_true` | 🔜 | Lowers to `RaiseError`, which falls back | -| `current_catalog` | ✅ | Resolved to a literal by the analyzer (`ReplaceCurrentLike`) | -| `current_database` | ✅ | Resolved to a literal by the analyzer (`ReplaceCurrentLike`) | -| `current_schema` | ✅ | Alias of `current_database`; resolved to a literal by the analyzer | -| `current_user` | ✅ | Resolved to a literal by the analyzer; same as `user` | -| `equal_null` | ✅ | Lowers to `<=>` (`EqualNullSafe`) | -| `is_variant_null` | 🔜 | tracking [#4098](https://github.com/apache/datafusion-comet/issues/4098) | -| `monotonically_increasing_id` | ✅ | | -| `parse_json` | 🔜 | tracking [#4098](https://github.com/apache/datafusion-comet/issues/4098) | -| `raise_error` | 🔜 | Raises a runtime error | -| `rand` | ✅ | Seed must be a literal | -| `randn` | ✅ | Seed must be a literal | -| `schema_of_variant` | 🔜 | tracking 
[#4098](https://github.com/apache/datafusion-comet/issues/4098) | -| `schema_of_variant_agg` | 🔜 | tracking [#4098](https://github.com/apache/datafusion-comet/issues/4098) | -| `session_user` | ✅ | Alias of `current_user`; resolved to a literal by the analyzer | -| `spark_partition_id` | ✅ | | -| `to_variant_object` | 🔜 | tracking [#4098](https://github.com/apache/datafusion-comet/issues/4098) | -| `try_aes_decrypt` | 🔜 | Falls back; planned via codegen dispatch ([#4558](https://github.com/apache/datafusion-comet/issues/4558)) | -| `try_parse_json` | 🔜 | tracking [#4098](https://github.com/apache/datafusion-comet/issues/4098) | -| `try_variant_get` | 🔜 | tracking [#4098](https://github.com/apache/datafusion-comet/issues/4098) | -| `typeof` | ✅ | Foldable; resolved to a literal before Comet sees the plan | -| `user` | ✅ | Resolved to a literal by the Spark analyzer before reaching Comet | -| `uuid` | 🔜 | Nondeterministic random UUID | -| `variant_get` | 🔜 | tracking [#4098](https://github.com/apache/datafusion-comet/issues/4098) | + + --- ## predicate_funcs -| Function | Status | Notes | -| ------------- | ------ | --------------------------------------------------------------------------------------- | -| `!` | ✅ | | -| `<` | ✅ | | -| `<=` | ✅ | | -| `<=>` | ✅ | | -| `=` | ✅ | | -| `==` | ✅ | | -| `>` | ✅ | | -| `>=` | ✅ | | -| `and` | ✅ | | -| `between` | ✅ | | -| `ilike` | ✅ | | -| `in` | ✅ | | -| `isnan` | ✅ | | -| `isnotnull` | ✅ | | -| `isnull` | ✅ | | -| `like` | ✅ | | -| `not` | ✅ | | -| `or` | ✅ | | -| `regexp` | ✅ | Falls back by default; opt-in via allowIncompatible ([details](compatibility/regex.md)) | -| `regexp_like` | ✅ | Falls back by default; opt-in via allowIncompatible ([details](compatibility/regex.md)) | -| `rlike` | ✅ | Falls back by default; opt-in via allowIncompatible ([details](compatibility/regex.md)) | + + --- ## string_funcs -| Function | Status | Notes | -| -------------------- | ------ | 
------------------------------------------------------------------------------------------------------------------------------------------ | -| `ascii` | ✅ | | -| `base64` | 🔜 | Lowers to `StaticInvoke(encode)` (not allowlisted); falls back | -| `bit_length` | ✅ | | -| `btrim` | ✅ | | -| `char` | ✅ | | -| `char_length` | ✅ | | -| `character_length` | ✅ | | -| `chr` | ✅ | | -| `collate` | 🔜 | Spark collation (umbrella [#2190](https://github.com/apache/datafusion-comet/issues/2190)) | -| `collation` | ✅ | Constant-folded to a literal (Spark 4.0+) | -| `concat_ws` | ✅ | | -| `contains` | ✅ | | -| `decode` | ✅ | | -| `elt` | 🔜 | [#4538](https://github.com/apache/datafusion-comet/issues/4538) | -| `encode` | 🔜 | Lowers to `StaticInvoke(encode)` (not allowlisted); falls back | -| `endswith` | ✅ | | -| `find_in_set` | 🔜 | [#4538](https://github.com/apache/datafusion-comet/issues/4538) | -| `format_number` | 🔜 | [#4538](https://github.com/apache/datafusion-comet/issues/4538) | -| `format_string` | 🔜 | [#4538](https://github.com/apache/datafusion-comet/issues/4538) | -| `initcap` | ✅ | | -| `instr` | ✅ | | -| `lcase` | ✅ | | -| `left` | ✅ | | -| `len` | ✅ | | -| `length` | ✅ | | -| `levenshtein` | 🔜 | [#4538](https://github.com/apache/datafusion-comet/issues/4538) | -| `locate` | 🔜 | [#4538](https://github.com/apache/datafusion-comet/issues/4538) | -| `lower` | ✅ | | -| `lpad` | ✅ | | -| `ltrim` | ✅ | | -| `luhn_check` | ✅ | Native via `StaticInvoke` (tests: luhn_check.sql) | -| `mask` | 🔜 | Data masking | -| `octet_length` | ✅ | | -| `overlay` | 🔜 | [#4538](https://github.com/apache/datafusion-comet/issues/4538) | -| `position` | 🔜 | [#4538](https://github.com/apache/datafusion-comet/issues/4538) | -| `printf` | 🔜 | [#4538](https://github.com/apache/datafusion-comet/issues/4538) | -| `regexp_count` | 🔜 | tracking [#4098](https://github.com/apache/datafusion-comet/issues/4098) | -| `regexp_extract` | 🔜 | tracking 
[#4098](https://github.com/apache/datafusion-comet/issues/4098) | -| `regexp_extract_all` | 🔜 | tracking [#4098](https://github.com/apache/datafusion-comet/issues/4098) | -| `regexp_instr` | 🔜 | tracking [#4098](https://github.com/apache/datafusion-comet/issues/4098) | -| `regexp_replace` | ✅ | | -| `regexp_substr` | 🔜 | tracking [#4098](https://github.com/apache/datafusion-comet/issues/4098) | -| `repeat` | ✅ | | -| `replace` | ✅ | | -| `right` | ✅ | | -| `rpad` | ✅ | | -| `rtrim` | ✅ | | -| `soundex` | 🔜 | [#4538](https://github.com/apache/datafusion-comet/issues/4538) | -| `space` | ✅ | | -| `split` | ✅ | | -| `split_part` | 🔜 | Lowers to `element_at(StringSplitSQL(...))`; `StringSplitSQL` falls back ([#4561](https://github.com/apache/datafusion-comet/issues/4561)) | -| `startswith` | ✅ | | -| `substr` | ✅ | | -| `substring` | ✅ | | -| `substring_index` | ✅ | | -| `to_binary` | ✅ | Hex form accelerated; other formats fall back | -| `to_char` | 🔜 | [#4538](https://github.com/apache/datafusion-comet/issues/4538) | -| `to_number` | 🔜 | [#4538](https://github.com/apache/datafusion-comet/issues/4538) | -| `to_varchar` | 🔜 | [#4538](https://github.com/apache/datafusion-comet/issues/4538) | -| `translate` | ✅ | | -| `trim` | ✅ | | -| `try_to_binary` | 🔜 | Lowers to `TryEval(...)`, which falls back | -| `try_to_number` | 🔜 | TRY variant of `to_number` | -| `ucase` | ✅ | | -| `unbase64` | 🔜 | [#4538](https://github.com/apache/datafusion-comet/issues/4538) | -| `upper` | ✅ | | + + --- ## struct_funcs -| Function | Status | Notes | -| -------------- | ------ | ------------------------------- | -| `named_struct` | ✅ | Duplicate field names fall back | -| `struct` | ✅ | | + + --- ## url_funcs -| Function | Status | Notes | -| ---------------- | ------ | ----- | -| `parse_url` | ✅ | | -| `try_url_decode` | ✅ | | -| `url_decode` | ✅ | | -| `url_encode` | ✅ | | + + + +--- + +## variant_funcs + + + --- @@ -625,17 +212,8 @@ When enabled, `lag` and `lead` are explicitly wired; 
aggregate window functions `ntile`, `percent_rank`, `cume_dist`, `nth_value`) are not yet wired in the window serde and fall back to Spark. -| Function | Status | Notes | -| -------------- | ------ | ------------------------------------------------------------------------------------------- | -| `cume_dist` | 🔜 | Window function; tracked by [#2721](https://github.com/apache/datafusion-comet/issues/2721) | -| `dense_rank` | 🔜 | Window function; tracked by [#2721](https://github.com/apache/datafusion-comet/issues/2721) | -| `lag` | ✅ | via `CometWindowExec` | -| `lead` | ✅ | via `CometWindowExec` | -| `nth_value` | 🔜 | Window function; tracked by [#2721](https://github.com/apache/datafusion-comet/issues/2721) | -| `ntile` | 🔜 | Window function; tracked by [#2721](https://github.com/apache/datafusion-comet/issues/2721) | -| `percent_rank` | 🔜 | Window function; tracked by [#2721](https://github.com/apache/datafusion-comet/issues/2721) | -| `rank` | 🔜 | Window function; tracked by [#2721](https://github.com/apache/datafusion-comet/issues/2721) | -| `row_number` | 🔜 | Window function; tracked by [#2721](https://github.com/apache/datafusion-comet/issues/2721) | + + --- @@ -653,5 +231,5 @@ This list is illustrative, not exhaustive: the per-function tables are not the c ## See also -- [Comet Compatibility Guide](compatibility/index.md) - known incompatibilities and edge cases for ⚠️ expressions. +- [Comet Compatibility Guide](compatibility/index.md) - known incompatibilities and edge cases for expressions with opt-in incompatible behavior. - [Expression Audits (contributor guide)](../../contributor-guide/expression-audits/index.md) - per-version (Spark 3.4 / 3.5 / 4.0 / 4.1) audit notes for audited expressions. 
diff --git a/spark/src/main/scala/org/apache/comet/ExpressionReference.scala b/spark/src/main/scala/org/apache/comet/ExpressionReference.scala new file mode 100644 index 0000000000..3fbcabb83d --- /dev/null +++ b/spark/src/main/scala/org/apache/comet/ExpressionReference.scala @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet + +import org.apache.spark.sql.catalyst.analysis.FunctionRegistry + +/** + * Pure helpers for generating the Spark expression reference table (`expressions.md`). No file IO + * or SparkSession dependency, so it is unit-testable in isolation. The impure parts (enumerating + * Spark's FunctionRegistry, reading the serde maps, writing files) live in [[GenerateDocs]]. + */ +object ExpressionReference { + + /** Status shown in the reference table. 
*/ + sealed trait ExprStatus { def symbol: String } + // scalastyle:off nonascii + case object Supported extends ExprStatus { override val symbol = "✅" } + case object Planned extends ExprStatus { override val symbol = "🔜" } + case object NotPlanned extends ExprStatus { override val symbol = "💤" } + // scalastyle:on nonascii + + /** + * A built-in that is neither serde-backed nor listed in `plannedExpressions`. Rendered with a + * warning and the same "🔜" glyph as [[Planned]] on purpose: an unclassified function is + * treated as provisionally planned in the table, and its Notes cell ("unclassified; not yet + * reviewed") is what distinguishes it from a deliberately planned entry. + */ + // scalastyle:off nonascii + case object Unclassified extends ExprStatus { override val symbol = "🔜" } + // scalastyle:on nonascii + + /** + * Curated metadata for a function described by the `plannedExpressions` map in [[GenerateDocs]] + * (a CI-exempt file). `status` is a curated status (Supported, Planned, or NotPlanned). + * Supported is used for functions Comet supports through a non-serde path (an analyzer rewrite, + * constant-folding, or an operator) rather than a serde-backed expression. The only status that + * may not appear here is the placeholder [[Unclassified]]. + */ + case class PlannedExpr( + status: ExprStatus, + issue: Option[Int] = None, + note: Option[String] = None) { + require(status != Unclassified, s"curated status must not be Unclassified, got $status") + } + + /** + * Serde-derived doc facts for one expression class. `hasCompatContent` is true when the serde + * produces any compat-guide content (a non-empty getCompatibleNotes / getIncompatibleReasons / + * getUnsupportedReasons), which is what gates whether the reference table emits a compat-guide + * link. + */ + case class SerdeDocInfo( + summary: Option[String], + hasCompatContent: Boolean, + category: Option[String], + anchor: String) + + /** One Spark built-in as seen in FunctionRegistry. 
*/ + case class FunctionEntry(name: String, group: String, className: String) + + /** A fully resolved row ready to render. */ + case class ReferenceRow(name: String, status: ExprStatus, notes: String) + + /** + * Enumerate Spark's built-in functions using the public FunctionRegistry API. Uses + * `builtin.listFunction` + `lookupFunction` (both public) rather than the `expressions` val, + * whose visibility varies across Spark versions. + */ + def builtinFunctions(): Seq[FunctionEntry] = { + val registry = FunctionRegistry.builtin + registry.listFunction().flatMap { id => + registry.lookupFunction(id).map { info => + FunctionEntry( + name = id.funcName, + group = Option(info.getGroup).getOrElse(""), + className = info.getClassName) + } + } + } + + private def issueLink(n: Int): String = + s"[#$n](https://github.com/apache/datafusion-comet/issues/$n)" + + /** Render one row with single-space padding (matches the .prettierignore'd table style). */ + def renderRow(r: ReferenceRow): String = + s"| `${r.name}` | ${r.status.symbol} | ${r.notes} |" + + /** Render a full group table (header + separator + sorted rows). */ + def renderTable(rows: Seq[ReferenceRow]): String = { + val header = Seq("| Function | Status | Notes |", "| --- | --- | --- |") + (header ++ rows.sortBy(_.name).map(renderRow)).mkString("\n") + } + + /** + * Resolve one function to a row, plus an optional warning string when the function is + * unclassified (no serde and not in the planned list). 
+ */ + def resolveRow( + entry: FunctionEntry, + serde: Option[SerdeDocInfo], + planned: Option[PlannedExpr]): (ReferenceRow, Option[String]) = { + serde match { + case Some(info) => + // Link to the compat guide only when the serde produced compat content and the + // expression's category has a compat-guide page (category is defined). + val link = + info.category.filter(_ => info.hasCompatContent).map { category => + s"[details](compatibility/expressions/$category.md#${info.anchor})" + } + val notes = (info.summary ++ link).mkString(" ") + (ReferenceRow(entry.name, Supported, notes), None) + case None => + planned match { + case Some(p) => + val notes = (p.note ++ p.issue.map(issueLink)).mkString(" ") + (ReferenceRow(entry.name, p.status, notes), None) + case None => + ( + ReferenceRow(entry.name, Unclassified, "unclassified; not yet reviewed"), + Some(s"Unclassified Spark builtin: ${entry.name} (group ${entry.group})")) + } + } + } +} diff --git a/spark/src/main/scala/org/apache/comet/GenerateDocs.scala b/spark/src/main/scala/org/apache/comet/GenerateDocs.scala index 43f0016d79..4f924c729c 100644 --- a/spark/src/main/scala/org/apache/comet/GenerateDocs.scala +++ b/spark/src/main/scala/org/apache/comet/GenerateDocs.scala @@ -23,10 +23,12 @@ import java.io.{BufferedOutputStream, BufferedReader, FileOutputStream, FileRead import scala.collection.mutable import scala.collection.mutable.ListBuffer +import scala.util.Try -import org.apache.spark.sql.catalyst.expressions.Cast +import org.apache.spark.sql.catalyst.expressions.{Cast, Expression} import org.apache.comet.CometConf.COMET_ONHEAP_MEMORY_OVERHEAD +import org.apache.comet.ExpressionReference._ import org.apache.comet.expressions.{CometCast, CometEvalMode} import org.apache.comet.serde.{Compatible, Incompatible, QueryPlanSerde, Unsupported} @@ -141,6 +143,556 @@ object GenerateDocs { serde.getUnsupportedReasons()) }))) + /** + * Curated status for Spark built-ins that Comet does not serde-support.
This list lives in + * GenerateDocs (not in the serde files) on purpose: it is excluded from the heavy CI path + * filters (build, spark-sql, iceberg) in dev/ci/compute-changes.py, so editing it (e.g. when an + * issue is filed) does not trigger those heavy jobs. Keyed by Spark function name. + */ + private val plannedExpressions: Map[String, PlannedExpr] = Map( + "aes_decrypt" -> PlannedExpr( + Planned, + issue = Some(4558), + note = Some("Falls back; `StaticInvoke` not allowlisted; planned via codegen dispatch")), + "aes_encrypt" -> PlannedExpr( + Planned, + issue = Some(4558), + note = Some("Falls back; planned via codegen dispatch; nondeterministic IV by default")), + "aggregate" -> PlannedExpr(Planned, issue = Some(4224)), + "any" -> PlannedExpr(Supported), + "any_value" -> PlannedExpr(Supported), + "approx_count_distinct" -> PlannedExpr(Planned, issue = Some(4098), note = Some("tracking")), + "approx_percentile" -> PlannedExpr(NotPlanned), + "approx_top_k" -> PlannedExpr(NotPlanned), + "approx_top_k_accumulate" -> PlannedExpr(NotPlanned), + "approx_top_k_combine" -> PlannedExpr(NotPlanned), + "approx_top_k_estimate" -> PlannedExpr(NotPlanned), + "array_agg" -> PlannedExpr( + Planned, + issue = Some(2524), + note = Some("Array aggregate (related to `collect_list`)")), + "array_prepend" -> PlannedExpr(Planned, note = Some("Sibling of `array_append`")), + "array_size" -> PlannedExpr(Supported), + "array_sort" -> PlannedExpr(Planned, issue = Some(4224)), + "assert_true" -> PlannedExpr( + Planned, + note = Some("Lowers to `RaiseError`, which falls back")), + "base64" -> PlannedExpr( + Planned, + note = Some("Lowers to `StaticInvoke(encode)` (not allowlisted); falls back")), + "between" -> PlannedExpr(Supported), + "bitmap_and_agg" -> PlannedExpr(NotPlanned), + "bitmap_bit_position" -> PlannedExpr(NotPlanned), + "bitmap_bucket_number" -> PlannedExpr(NotPlanned), + "bitmap_construct_agg" -> PlannedExpr(NotPlanned), + "bitmap_count" -> PlannedExpr(NotPlanned), + 
"bitmap_or_agg" -> PlannedExpr(NotPlanned), + "bool_and" -> PlannedExpr(Supported), + "bool_or" -> PlannedExpr(Supported), + "bround" -> PlannedExpr(Planned, issue = Some(4538)), + "ceil" -> PlannedExpr(Supported, note = Some("Two-arg form falls back")), + "ceiling" -> PlannedExpr(Supported), + "collate" -> PlannedExpr( + Planned, + issue = Some(2190), + note = Some("Spark collation (umbrella)")), + "collation" -> PlannedExpr( + Supported, + note = Some("Constant-folded to a literal (Spark 4.0+)")), + "collect_list" -> PlannedExpr(Planned, issue = Some(2524)), + "contains" -> PlannedExpr(Supported), + "conv" -> PlannedExpr(Planned, issue = Some(4538)), + "count_if" -> PlannedExpr(Supported), + "count_min_sketch" -> PlannedExpr(NotPlanned), + "cume_dist" -> PlannedExpr( + Planned, + issue = Some(2721), + note = Some("Window function; tracked by")), + "curdate" -> PlannedExpr( + Supported, + note = Some("Constant-folded to a literal (alias of `current_date`)")), + "current_catalog" -> PlannedExpr( + Supported, + note = Some("Resolved to a literal by the analyzer (`ReplaceCurrentLike`)")), + "current_database" -> PlannedExpr( + Supported, + note = Some("Resolved to a literal by the analyzer (`ReplaceCurrentLike`)")), + "current_date" -> PlannedExpr( + Supported, + note = Some("Constant-folded to a literal before Comet sees the plan")), + "current_schema" -> PlannedExpr( + Supported, + note = Some("Alias of `current_database`; resolved to a literal by the analyzer")), + "current_time" -> PlannedExpr( + Planned, + issue = Some(4288), + note = Some("Blocked on Spark 4.1 TIME type support")), + "current_timestamp" -> PlannedExpr( + Supported, + note = Some("Constant-folded to a literal before Comet sees the plan")), + "current_timezone" -> PlannedExpr(Supported), + "current_user" -> PlannedExpr( + Supported, + note = Some("Resolved to a literal by the analyzer; same as `user`")), + "date_part" -> PlannedExpr(Supported), + "datepart" -> PlannedExpr(Supported), + "dayname" 
-> PlannedExpr(Planned, issue = Some(4544)), + "decode" -> PlannedExpr(Supported), + "dense_rank" -> PlannedExpr( + Planned, + issue = Some(2721), + note = Some("Window function; tracked by")), + "e" -> PlannedExpr(Supported, note = Some("Folds to a literal (like `pi`)")), + "elt" -> PlannedExpr(Planned, issue = Some(4538)), + "encode" -> PlannedExpr( + Planned, + note = Some("Lowers to `StaticInvoke(encode)` (not allowlisted); falls back")), + "endswith" -> PlannedExpr(Supported), + "equal_null" -> PlannedExpr(Supported, note = Some("Lowers to `<=>` (`EqualNullSafe`)")), + "every" -> PlannedExpr(Supported), + "exists" -> PlannedExpr(Planned, issue = Some(4224)), + "explode" -> PlannedExpr(Supported, note = Some("via `CometExplodeExec`")), + "explode_outer" -> PlannedExpr( + Supported, + note = Some("outer=true falls back (Incompatible) " + + "([audit](../../contributor-guide/expression-audits/generator_funcs.md#explode_outer))")), + "extract" -> PlannedExpr(Supported), + "find_in_set" -> PlannedExpr(Planned, issue = Some(4538)), + "floor" -> PlannedExpr(Supported, note = Some("Two-arg form falls back")), + "forall" -> PlannedExpr(Planned, issue = Some(4224)), + "format_number" -> PlannedExpr(Planned, issue = Some(4538)), + "format_string" -> PlannedExpr(Planned, issue = Some(4538)), + "from_avro" -> PlannedExpr(NotPlanned), + "from_protobuf" -> PlannedExpr(NotPlanned), + "get" -> PlannedExpr(Supported), + "grouping" -> PlannedExpr( + Planned, + note = Some("Grouping indicator for ROLLUP/CUBE/GROUPING SETS")), + "grouping_id" -> PlannedExpr( + Planned, + note = Some("Grouping indicator for ROLLUP/CUBE/GROUPING SETS")), + "histogram_numeric" -> PlannedExpr(NotPlanned), + "hll_sketch_agg" -> PlannedExpr(NotPlanned), + "hll_sketch_estimate" -> PlannedExpr(NotPlanned), + "hll_union" -> PlannedExpr(NotPlanned), + "hll_union_agg" -> PlannedExpr(NotPlanned), + "hour" -> PlannedExpr(Supported), + "hypot" -> PlannedExpr(Planned, issue = Some(4538)), + "ifnull" -> 
PlannedExpr(Supported), + "ilike" -> PlannedExpr(Supported), + "inline" -> PlannedExpr(Planned, note = Some("Operator-level generator (like `explode`)")), + "inline_outer" -> PlannedExpr( + Planned, + note = Some("Operator-level generator (like `explode`)")), + "input_file_block_length" -> PlannedExpr(NotPlanned), + "input_file_block_start" -> PlannedExpr(NotPlanned), + "input_file_name" -> PlannedExpr(NotPlanned), + "is_valid_utf8" -> PlannedExpr(NotPlanned), + "is_variant_null" -> PlannedExpr(Planned, issue = Some(4098), note = Some("tracking")), + "java_method" -> PlannedExpr(NotPlanned), + "json_array_length" -> PlannedExpr(Planned, issue = Some(4098), note = Some("tracking")), + "json_object_keys" -> PlannedExpr(Planned, issue = Some(3161)), + "json_tuple" -> PlannedExpr(Planned, issue = Some(3160)), + "kll_merge_agg_bigint" -> PlannedExpr(NotPlanned), + "kll_merge_agg_double" -> PlannedExpr(NotPlanned), + "kll_merge_agg_float" -> PlannedExpr(NotPlanned), + "kll_sketch_agg_bigint" -> PlannedExpr(NotPlanned), + "kll_sketch_agg_double" -> PlannedExpr(NotPlanned), + "kll_sketch_agg_float" -> PlannedExpr(NotPlanned), + "kll_sketch_get_n_bigint" -> PlannedExpr(NotPlanned), + "kll_sketch_get_n_double" -> PlannedExpr(NotPlanned), + "kll_sketch_get_n_float" -> PlannedExpr(NotPlanned), + "kll_sketch_get_quantile_bigint" -> PlannedExpr(NotPlanned), + "kll_sketch_get_quantile_double" -> PlannedExpr(NotPlanned), + "kll_sketch_get_quantile_float" -> PlannedExpr(NotPlanned), + "kll_sketch_get_rank_bigint" -> PlannedExpr(NotPlanned), + "kll_sketch_get_rank_double" -> PlannedExpr(NotPlanned), + "kll_sketch_get_rank_float" -> PlannedExpr(NotPlanned), + "kll_sketch_merge_bigint" -> PlannedExpr(NotPlanned), + "kll_sketch_merge_double" -> PlannedExpr(NotPlanned), + "kll_sketch_merge_float" -> PlannedExpr(NotPlanned), + "kll_sketch_to_string_bigint" -> PlannedExpr(NotPlanned), + "kll_sketch_to_string_double" -> PlannedExpr(NotPlanned), + "kll_sketch_to_string_float" -> 
PlannedExpr(NotPlanned), + "kurtosis" -> PlannedExpr(Planned, issue = Some(4098), note = Some("tracking")), + "lag" -> PlannedExpr(Supported, note = Some("via `CometWindowExec`")), + "lead" -> PlannedExpr(Supported, note = Some("via `CometWindowExec`")), + "levenshtein" -> PlannedExpr(Planned, issue = Some(4538)), + "listagg" -> PlannedExpr(Planned, note = Some("String aggregation")), + "localtimestamp" -> PlannedExpr(Supported), + "locate" -> PlannedExpr(Planned, issue = Some(4538)), + "log1p" -> PlannedExpr(Planned, issue = Some(4538)), + "lpad" -> PlannedExpr(Supported), + "luhn_check" -> PlannedExpr( + Supported, + note = Some("Native via `StaticInvoke` (tests: luhn_check.sql)")), + "make_dt_interval" -> PlannedExpr(Planned, issue = Some(4541)), + "make_interval" -> PlannedExpr( + Planned, + issue = Some(4540), + note = Some("Produces legacy CalendarInterval; tracked by")), + "make_time" -> PlannedExpr( + Planned, + issue = Some(4288), + note = Some("Spark 4.1 TIME type; tracked by")), + "make_timestamp" -> PlannedExpr(Supported), + "make_timestamp_ltz" -> PlannedExpr(Supported, note = Some("2-arg TIME form falls back")), + "make_timestamp_ntz" -> PlannedExpr(Supported, note = Some("2-arg TIME form falls back")), + "make_valid_utf8" -> PlannedExpr(NotPlanned), + "make_ym_interval" -> PlannedExpr(Planned, issue = Some(4541)), + "map" -> PlannedExpr(Planned, note = Some("Constructs a map")), + "map_concat" -> PlannedExpr(Planned, note = Some("Concatenates maps")), + "map_filter" -> PlannedExpr(Planned, issue = Some(4224)), + "map_zip_with" -> PlannedExpr(Planned, issue = Some(4224)), + "mask" -> PlannedExpr(Planned, note = Some("Data masking")), + "max_by" -> PlannedExpr(Planned, issue = Some(3841)), + "median" -> PlannedExpr(Planned, issue = Some(4098), note = Some("tracking")), + "min_by" -> PlannedExpr(Planned, issue = Some(3841)), + "minute" -> PlannedExpr(Supported), + "mode" -> PlannedExpr(Planned, issue = Some(3970)), + "monthname" -> PlannedExpr(Planned, 
issue = Some(4544)), + "nanvl" -> PlannedExpr(Planned, issue = Some(4538)), + "now" -> PlannedExpr( + Supported, + note = Some("Constant-folded to a literal (alias of `current_timestamp`)")), + "nth_value" -> PlannedExpr( + Planned, + issue = Some(2721), + note = Some("Window function; tracked by")), + "ntile" -> PlannedExpr( + Planned, + issue = Some(2721), + note = Some("Window function; tracked by")), + "nullif" -> PlannedExpr(Supported), + "nullifzero" -> PlannedExpr(Supported, note = Some("Lowers to `if`/`=` (Spark 4.0+)")), + "nvl" -> PlannedExpr(Supported), + "nvl2" -> PlannedExpr(Supported), + "overlay" -> PlannedExpr(Planned, issue = Some(4538)), + "parse_json" -> PlannedExpr(Planned, issue = Some(4098), note = Some("tracking")), + "percent_rank" -> PlannedExpr( + Planned, + issue = Some(2721), + note = Some("Window function; tracked by")), + "percentile" -> PlannedExpr(Planned, issue = Some(4542)), + "percentile_approx" -> PlannedExpr(NotPlanned), + "percentile_cont" -> PlannedExpr(Planned, note = Some("Percentile aggregate")), + "percentile_disc" -> PlannedExpr(Planned, note = Some("Percentile aggregate")), + "pmod" -> PlannedExpr(Planned, issue = Some(4538)), + "posexplode" -> PlannedExpr(Supported, note = Some("via `CometExplodeExec`")), + "posexplode_outer" -> PlannedExpr( + Supported, + note = Some( + "outer=true falls back (Incompatible) " + + "([audit](../../contributor-guide/expression-audits/" + + "generator_funcs.md#posexplode_outer))")), + "position" -> PlannedExpr(Planned, issue = Some(4538)), + "positive" -> PlannedExpr(Supported), + "printf" -> PlannedExpr(Planned, issue = Some(4538)), + "quote" -> PlannedExpr(NotPlanned), + "raise_error" -> PlannedExpr(Planned, note = Some("Raises a runtime error")), + "randstr" -> PlannedExpr(Planned, note = Some("Random string (Spark 4.0+)")), + "rank" -> PlannedExpr( + Planned, + issue = Some(2721), + note = Some("Window function; tracked by")), + "reduce" -> PlannedExpr(Planned, issue = Some(4224)), + 
"reflect" -> PlannedExpr(NotPlanned), + "regexp_count" -> PlannedExpr(Planned, issue = Some(4098), note = Some("tracking")), + "regexp_extract" -> PlannedExpr(Planned, issue = Some(4098), note = Some("tracking")), + "regexp_extract_all" -> PlannedExpr(Planned, issue = Some(4098), note = Some("tracking")), + "regexp_instr" -> PlannedExpr(Planned, issue = Some(4098), note = Some("tracking")), + "regexp_substr" -> PlannedExpr(Planned, issue = Some(4098), note = Some("tracking")), + "regr_avgx" -> PlannedExpr( + Supported, + note = Some( + "Native: Spark rewrites to `Average` " + + "(tests in [#4551](https://github.com/apache/datafusion-comet/issues/4551))")), + "regr_avgy" -> PlannedExpr( + Supported, + note = Some( + "Native: Spark rewrites to `Average` " + + "(tests in [#4551](https://github.com/apache/datafusion-comet/issues/4551))")), + "regr_count" -> PlannedExpr( + Supported, + note = Some( + "Native: Spark rewrites to `Count` " + + "(tests in [#4551](https://github.com/apache/datafusion-comet/issues/4551))")), + "regr_intercept" -> PlannedExpr( + Planned, + issue = Some(4552), + note = Some("Falls back; can reuse `covar_pop`/`var_pop` accumulators")), + "regr_r2" -> PlannedExpr( + Planned, + issue = Some(4552), + note = Some("Falls back; can reuse the `corr` accumulator")), + "regr_slope" -> PlannedExpr( + Planned, + issue = Some(4552), + note = Some("Falls back; can reuse `covar_pop`/`var_pop` accumulators")), + "regr_sxx" -> PlannedExpr( + Planned, + issue = Some(4552), + note = Some("Falls back; can reuse `var_pop` accumulator")), + "regr_sxy" -> PlannedExpr( + Planned, + issue = Some(4552), + note = Some("Falls back; can reuse `covar_pop` accumulator")), + "regr_syy" -> PlannedExpr( + Planned, + issue = Some(4552), + note = Some("Falls back; can reuse `var_pop` accumulator")), + "row_number" -> PlannedExpr( + Planned, + issue = Some(2721), + note = Some("Window function; tracked by")), + "rpad" -> PlannedExpr(Supported), + "schema_of_avro" -> 
PlannedExpr(NotPlanned), + "schema_of_json" -> PlannedExpr(Planned, issue = Some(3163)), + "schema_of_variant" -> PlannedExpr(Planned, issue = Some(4098), note = Some("tracking")), + "schema_of_variant_agg" -> PlannedExpr(Planned, issue = Some(4098), note = Some("tracking")), + "second" -> PlannedExpr(Supported), + "sentences" -> PlannedExpr(NotPlanned), + "sequence" -> PlannedExpr(Planned, issue = Some(4538)), + "session_user" -> PlannedExpr( + Supported, + note = Some("Alias of `current_user`; resolved to a literal by the analyzer")), + "session_window" -> PlannedExpr( + Planned, + issue = Some(4553), + note = Some("Time-window grouping; tracked by")), + "shuffle" -> PlannedExpr(Planned, note = Some("Random array shuffle")), + "skewness" -> PlannedExpr(Planned, issue = Some(4098), note = Some("tracking")), + "some" -> PlannedExpr(Supported), + "soundex" -> PlannedExpr(Planned, issue = Some(4538)), + "split_part" -> PlannedExpr( + Planned, + issue = Some(4561), + note = Some("Lowers to `element_at(StringSplitSQL(...))`; `StringSplitSQL` falls back")), + "stack" -> PlannedExpr(Planned, note = Some("Operator-level generator")), + "startswith" -> PlannedExpr(Supported), + "string_agg" -> PlannedExpr(Planned, note = Some("String aggregation (alias of `listagg`)")), + "theta_difference" -> PlannedExpr(NotPlanned), + "theta_intersection" -> PlannedExpr(NotPlanned), + "theta_intersection_agg" -> PlannedExpr(NotPlanned), + "theta_sketch_agg" -> PlannedExpr(NotPlanned), + "theta_sketch_estimate" -> PlannedExpr(NotPlanned), + "theta_union" -> PlannedExpr(NotPlanned), + "theta_union_agg" -> PlannedExpr(NotPlanned), + "time_diff" -> PlannedExpr( + Planned, + issue = Some(4288), + note = Some("Spark 4.1 TIME type; tracked by")), + "time_trunc" -> PlannedExpr( + Planned, + issue = Some(4288), + note = Some("Spark 4.1 TIME type; tracked by")), + "to_avro" -> PlannedExpr(NotPlanned), + "to_binary" -> PlannedExpr( + Supported, + note = Some("Hex form accelerated; other formats 
fall back")), + "to_char" -> PlannedExpr(Planned, issue = Some(4538)), + "to_date" -> PlannedExpr( + Supported, + note = Some( + "Rewrites to `Cast` (or `Cast(GetTimestamp)` with a format) before Comet sees the plan")), + "to_number" -> PlannedExpr(Planned, issue = Some(4538)), + "to_protobuf" -> PlannedExpr(NotPlanned), + "to_time" -> PlannedExpr( + Planned, + issue = Some(4288), + note = Some("Spark 4.1 TIME type; tracked by")), + "to_timestamp" -> PlannedExpr( + Supported, + note = + Some("Rewrites to `Cast` (or `GetTimestamp` with a format) before Comet sees the plan")), + "to_timestamp_ltz" -> PlannedExpr( + Supported, + note = Some("Rewrites to `to_timestamp` (`TimestampType`)")), + "to_timestamp_ntz" -> PlannedExpr( + Supported, + note = Some("Rewrites to `to_timestamp` (`TimestampNTZType`)")), + "to_varchar" -> PlannedExpr(Planned, issue = Some(4538)), + "to_variant_object" -> PlannedExpr(Planned, issue = Some(4098), note = Some("tracking")), + "transform" -> PlannedExpr(Planned, issue = Some(4224)), + "transform_keys" -> PlannedExpr(Planned, issue = Some(4224)), + "transform_values" -> PlannedExpr(Planned, issue = Some(4224)), + "try_add" -> PlannedExpr(Supported, note = Some("Datetime/interval form falls back")), + "try_aes_decrypt" -> PlannedExpr( + Planned, + issue = Some(4558), + note = Some("Falls back; planned via codegen dispatch")), + "try_avg" -> PlannedExpr(Planned, issue = Some(4098), note = Some("tracking")), + "try_divide" -> PlannedExpr(Supported), + "try_element_at" -> PlannedExpr( + Supported, + note = Some("Lowers to `element_at`; array input (MapType falls back)")), + "try_make_interval" -> PlannedExpr( + Planned, + issue = Some(4540), + note = Some("Produces legacy CalendarInterval; tracked by")), + "try_make_timestamp" -> PlannedExpr( + Supported, + issue = Some(4554), + note = Some("Returns a wrong value instead of NULL for invalid inputs")), + "try_make_timestamp_ltz" -> PlannedExpr(NotPlanned), + "try_make_timestamp_ntz" -> 
PlannedExpr(NotPlanned), + "try_mod" -> PlannedExpr( + Planned, + issue = Some(4484), + note = Some("Lowers to `Remainder` with TRY eval mode, which falls back")), + "try_multiply" -> PlannedExpr(Supported), + "try_parse_json" -> PlannedExpr(Planned, issue = Some(4098), note = Some("tracking")), + "try_parse_url" -> PlannedExpr(NotPlanned), + "try_reflect" -> PlannedExpr(NotPlanned), + "try_subtract" -> PlannedExpr(Supported), + "try_sum" -> PlannedExpr(Planned, issue = Some(4098), note = Some("tracking")), + "try_to_binary" -> PlannedExpr( + Planned, + note = Some("Lowers to `TryEval(...)`, which falls back")), + "try_to_date" -> PlannedExpr( + Planned, + issue = Some(4556), + note = Some("Rewrites to `Cast`/`GetTimestamp` but currently falls back; tracked by")), + "try_to_number" -> PlannedExpr(Planned, note = Some("TRY variant of `to_number`")), + "try_to_time" -> PlannedExpr( + Planned, + issue = Some(4288), + note = Some("Spark 4.1 TIME type; tracked by")), + "try_to_timestamp" -> PlannedExpr( + Planned, + issue = Some(4556), + note = Some("Rewrites to `Cast`/`GetTimestamp` but currently falls back; tracked by")), + "try_url_decode" -> PlannedExpr(Supported), + "try_validate_utf8" -> PlannedExpr(NotPlanned), + "try_variant_get" -> PlannedExpr(Planned, issue = Some(4098), note = Some("tracking")), + "typeof" -> PlannedExpr( + Supported, + note = Some("Foldable; resolved to a literal before Comet sees the plan")), + "unbase64" -> PlannedExpr(Planned, issue = Some(4538)), + "uniform" -> PlannedExpr( + Supported, + note = Some("Constant-folded; literal arguments only (Spark 4.0+)")), + "url_decode" -> PlannedExpr(Supported), + "url_encode" -> PlannedExpr(Supported), + "user" -> PlannedExpr( + Supported, + note = Some("Resolved to a literal by the Spark analyzer before reaching Comet")), + "uuid" -> PlannedExpr(Planned, note = Some("Nondeterministic random UUID")), + "validate_utf8" -> PlannedExpr(NotPlanned), + "variant_get" -> PlannedExpr(Planned, issue = 
Some(4098), note = Some("tracking")), + "version" -> PlannedExpr(NotPlanned), + "width_bucket" -> PlannedExpr(Supported), + "window" -> PlannedExpr( + Planned, + issue = Some(4553), + note = Some("Time-window grouping; tracked by")), + "window_time" -> PlannedExpr( + Planned, + issue = Some(4553), + note = Some("Time-window grouping; tracked by")), + "zeroifnull" -> PlannedExpr(Supported, note = Some("Lowers to `coalesce` (Spark 4.0+)")), + "zip_with" -> PlannedExpr(Planned, issue = Some(4224))) + + /** + * Spark function groups rendered as tables, in display order. Families that fall back wholesale + * (xml_funcs, csv_funcs, geospatial, etc.) are intentionally omitted; they are covered by the + * "Not currently planned" prose section. + * + * NOTE(review): not referenced by `generateExpressionReference` in this patch; confirm intent. + */ + private val expressionGroups: Seq[String] = Seq( + "agg_funcs", + "array_funcs", + "bitwise_funcs", + "collection_funcs", + "conditional_funcs", + "conversion_funcs", + "datetime_funcs", + "generator_funcs", + "hash_funcs", + "json_funcs", + "lambda_funcs", + "map_funcs", + "math_funcs", + "misc_funcs", + "predicate_funcs", + "string_funcs", + "struct_funcs", + "url_funcs", + "variant_funcs", + "window_funcs") + + /** + * Map expression class -> compat-guide category, only for categories that have a page. Must + * stay in sync with `categoryPages`: only categories that have a compat-guide page belong here, + * so functions in other categories intentionally get no compat link.
+ */ + private val classToCategory: Map[Class[_], String] = Seq( + QueryPlanSerde.arrayExpressions.keys.map((_: Class[_]) -> "array"), + QueryPlanSerde.temporalExpressions.keys.map((_: Class[_]) -> "datetime"), + QueryPlanSerde.mathExpressions.keys.map((_: Class[_]) -> "math"), + QueryPlanSerde.structExpressions.keys.map((_: Class[_]) -> "struct"), + QueryPlanSerde.stringExpressions.keys.map((_: Class[_]) -> "string"), + QueryPlanSerde.mapExpressions.keys.map((_: Class[_]) -> "map"), + QueryPlanSerde.miscExpressions.keys.map((_: Class[_]) -> "misc"), + QueryPlanSerde.urlExpressions.keys.map((_: Class[_]) -> "url"), + QueryPlanSerde.aggrSerdeMap.keys.map((_: Class[_]) -> "aggregate")).flatten.toMap + + /** Build the serde-derived doc facts for a function class, if Comet serde-supports it. */ + private def serdeDocInfoFor(className: String): Option[SerdeDocInfo] = { + // scalastyle:off classforname + val clsOpt = Try(Class.forName(className)).toOption + // scalastyle:on classforname + clsOpt.flatMap { cls => + // The cast is erased at runtime; the lookup is by key equality, so a class that is + // not actually an Expression subtype simply matches no key and yields None. 
+ val exprSerde = QueryPlanSerde.exprSerdeMap + .get(cls.asInstanceOf[Class[_ <: Expression]]) + val aggSerde = QueryPlanSerde.aggrSerdeMap.get(cls) + val notesAndSummary: Option[(Option[String], Boolean)] = exprSerde match { + case Some(s) => + Some( + ( + s.getExpressionSummary, + s.getCompatibleNotes().nonEmpty || s.getIncompatibleReasons().nonEmpty || + s.getUnsupportedReasons().nonEmpty)) + case None => + aggSerde.map { s => + ( + s.getExpressionSummary, + s.getCompatibleNotes().nonEmpty || s.getIncompatibleReasons().nonEmpty || + s.getUnsupportedReasons().nonEmpty) + } + } + notesAndSummary.map { case (summary, hasCompat) => + // Lower-case with Locale.ROOT so the anchor does not depend on the JVM default locale + // (under a Turkish locale an upper-case `I` would not lower-case to ASCII `i`). + val anchor = cls.getSimpleName.toLowerCase(java.util.Locale.ROOT) + SerdeDocInfo( + summary = summary, + hasCompatContent = hasCompat, + category = classToCategory.get(cls), + anchor = anchor) + } + } + } + + /** + * Resolve all rows for a group, logging warnings for unclassified builtins. + * + * Consumed by `generateExpressionReference`.
+ */ + private def rowsForGroup(group: String, entries: Seq[FunctionEntry]): Seq[ReferenceRow] = { + entries.filter(_.group == group).map { e => + val (row, warn) = + resolveRow(e, serdeDocInfoFor(e.className), plannedExpressions.get(e.name)) + // scalastyle:off println + warn.foreach(w => println(s"[GenerateDocs][WARN] $w")) + // scalastyle:on println + row + } + } + def main(args: Array[String]): Unit = { val userGuideLocation = args(0) generateConfigReference(s"$userGuideLocation/configs.md") @@ -148,6 +700,35 @@ object GenerateDocs { for ((category, (page, notesFn)) <- categoryPages) { generateExpressionCompatNotes(s"$userGuideLocation/$page", category, notesFn()) } + generateExpressionReference(s"$userGuideLocation/expressions.md") + } + + /** + * Rewrite `filename` in place, emitting the rendered reference table for a function group + * after each line that matches the group-marker pattern. + */ + private def generateExpressionReference(filename: String): Unit = { + val entries = builtinFunctions() + // NOTE(review): this regex is empty and has no capture group, so `pattern(group)` below + // can never match and no table is written. The marker text looks to have been lost from + // this copy of the patch; restore it (with one capture group) before applying. + val pattern = "".r + val lines = readFile(filename) + val w = new BufferedOutputStream(new FileOutputStream(filename)) + // Close the stream even if rendering or a write throws, so the handle is not leaked. + try { + for (line <- lines) { + w.write(s"${line.stripTrailing()}\n".getBytes) + line.trim match { + case pattern(group) => + val table = ExpressionReference.renderTable(rowsForGroup(group, entries)) + w.write(s"$table\n".getBytes) + case _ => + } + } + } finally { + w.close() + } } private def generateConfigReference(filename: String): Unit = { diff --git a/spark/src/main/scala/org/apache/comet/serde/CometAggregateExpressionSerde.scala b/spark/src/main/scala/org/apache/comet/serde/CometAggregateExpressionSerde.scala index 9a83152168..00a5237742 100644 --- a/spark/src/main/scala/org/apache/comet/serde/CometAggregateExpressionSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/CometAggregateExpressionSerde.scala @@ -71,6 +71,15 @@ trait CometAggregateExpressionSerde[T <: AggregateFunction] { */ def getUnsupportedReasons(): Seq[String] = Seq.empty + /** + * Get a short, at-a-glance summary note for the expression-reference table (`expressions.md`). + * Keep it to a single clause, e.g.
"Interval types fall back" or "via `CometExplodeExec`". + * Detailed behavior belongs in the Compatibility Guide, which is generated from + * getCompatibleNotes/getIncompatibleReasons/getUnsupportedReasons; the reference table only + * links there. Returns None by default. + */ + def getExpressionSummary: Option[String] = None + /** * Determine the support level of the expression based on its attributes. * diff --git a/spark/src/main/scala/org/apache/comet/serde/CometExpressionSerde.scala b/spark/src/main/scala/org/apache/comet/serde/CometExpressionSerde.scala index e015de879a..14313c3658 100644 --- a/spark/src/main/scala/org/apache/comet/serde/CometExpressionSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/CometExpressionSerde.scala @@ -69,6 +69,15 @@ trait CometExpressionSerde[T <: Expression] { */ def getUnsupportedReasons(): Seq[String] = Seq.empty + /** + * Get a short, at-a-glance summary note for the expression-reference table (`expressions.md`). + * Keep it to a single clause, e.g. "Interval types fall back" or "via `CometExplodeExec`". + * Detailed behavior belongs in the Compatibility Guide, which is generated from + * getCompatibleNotes/getIncompatibleReasons/getUnsupportedReasons; the reference table only + * links there. Returns None by default. + */ + def getExpressionSummary: Option[String] = None + /** * Determine the support level of the expression based on its attributes. * diff --git a/spark/src/test/scala/org/apache/comet/ExpressionReferenceSuite.scala b/spark/src/test/scala/org/apache/comet/ExpressionReferenceSuite.scala new file mode 100644 index 0000000000..50e277419f --- /dev/null +++ b/spark/src/test/scala/org/apache/comet/ExpressionReferenceSuite.scala @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.comet
+
+import org.scalatest.funsuite.AnyFunSuite
+
+import org.apache.comet.ExpressionReference._
+
+class ExpressionReferenceSuite extends AnyFunSuite {
+
+  test("status symbols") {
+    assert(Supported.symbol == "✅")
+    assert(Planned.symbol == "🔜")
+    assert(NotPlanned.symbol == "💤")
+    assert(Unclassified.symbol == "🔜") // same symbol as Planned
+  }
+
+  test("real statuses have distinct symbols") {
+    assert(Set(Supported.symbol, Planned.symbol, NotPlanned.symbol).size == 3)
+  }
+
+  test("PlannedExpr rejects Unclassified status") {
+    assertThrows[IllegalArgumentException](PlannedExpr(Unclassified))
+  }
+
+  test("PlannedExpr allows Supported for non-serde paths") {
+    PlannedExpr(Supported, note = Some("via rewrite")) // passes if construction does not throw
+  }
+
+  private val arrayAppend = FunctionEntry("array_append", "array_funcs", "x.ArrayAppend")
+  private val arrayExcept = FunctionEntry("array_except", "array_funcs", "x.ArrayExcept")
+  private val kurtosis = FunctionEntry("kurtosis", "agg_funcs", "x.Kurtosis")
+  private val newThing = FunctionEntry("new_thing", "math_funcs", "x.NewThing")
+
+  test("serde-backed with no compat content -> Supported, summary only") {
+    val info = SerdeDocInfo(
+      Some("Interval types fall back"),
+      hasCompatContent = false,
+      None,
+      "arrayappend")
+    val (row, warn) = resolveRow(arrayAppend, Some(info), None)
+    assert(row.status == Supported)
+    assert(row.notes == "Interval types fall back")
+    assert(warn.isEmpty)
+  }
+
+  test("serde-backed with compat content -> Supported, summary + link") {
+    val info = SerdeDocInfo(
+      Some("Falls back by default"),
+      hasCompatContent = true,
+      Some("array"),
+      "arrayexcept")
+    val (row, _) = resolveRow(arrayExcept, Some(info), None)
+    assert(
+      row.notes ==
+        "Falls back by default [details](compatibility/expressions/array.md#arrayexcept)")
+  }
+
+  test("serde-backed compat content but category has no compat page -> no link") {
+    val info = SerdeDocInfo(None, hasCompatContent = true, None, "x")
+    val (row, _) = resolveRow(arrayAppend, Some(info), None)
+    assert(row.notes == "")
+  }
+
+  test("planned with issue -> Planned, issue link") {
+    val (row, warn) = resolveRow(kurtosis, None, Some(PlannedExpr(Planned, issue = Some(4098))))
+    assert(row.status == Planned)
+    assert(row.notes == "[#4098](https://github.com/apache/datafusion-comet/issues/4098)")
+    assert(warn.isEmpty)
+  }
+
+  test("not planned with note -> NotPlanned") {
+    val (row, _) = resolveRow(kurtosis, None, Some(PlannedExpr(NotPlanned, note = Some("Niche"))))
+    assert(row.status == NotPlanned)
+    assert(row.notes == "Niche")
+  }
+
+  test("serde-backed compat content, category present, no summary -> bare link") {
+    val info = SerdeDocInfo(None, hasCompatContent = true, Some("array"), "arrayexcept")
+    val (row, _) = resolveRow(arrayExcept, Some(info), None)
+    assert(row.status == Supported)
+    assert(row.notes == "[details](compatibility/expressions/array.md#arrayexcept)")
+  }
+
+  test("unclassified -> placeholder row and warning") {
+    val (row, warn) = resolveRow(newThing, None, None)
+    assert(row.status == Unclassified)
+    assert(row.notes == "unclassified; not yet reviewed")
+    assert(warn.exists(_.contains("new_thing"))) // warning names the function
+    assert(warn.exists(_.contains("math_funcs"))) // and the group it belongs to
+  }
+
+  test("renderRow uses backticked name, status symbol, single-space padding") {
+    assert(renderRow(ReferenceRow("any", Supported, "")) == "| `any` | ✅ | |")
+    assert(
+      renderRow(ReferenceRow("kurtosis", Planned, "[#4098](u)")) ==
+        "| `kurtosis` | 🔜 | [#4098](u) |")
+  }
+
+  test("renderTable sorts rows by name and emits header") {
+    val rows = Seq(ReferenceRow("b", Supported, ""), ReferenceRow("a", Planned, "x"))
+    val expected =
+      """|| Function | Status | Notes |
+         || --- | --- | --- |
+         || `a` | 🔜 | x |
+         || `b` | ✅ | |""".stripMargin
+    assert(renderTable(rows) == expected)
+  }
+}
diff --git a/spark/src/test/scala/org/apache/comet/FunctionRegistryEnumerationSuite.scala b/spark/src/test/scala/org/apache/comet/FunctionRegistryEnumerationSuite.scala
new file mode 100644
index 0000000000..d1a143e94e
--- /dev/null
+++ b/spark/src/test/scala/org/apache/comet/FunctionRegistryEnumerationSuite.scala
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.comet
+
+import org.scalatest.funsuite.AnyFunSuite
+
+class FunctionRegistryEnumerationSuite extends AnyFunSuite {
+
+  test("enumerates known builtins with group and class") {
+    val entries = ExpressionReference.builtinFunctions()
+    // Sanity: Spark registers hundreds of builtins; a near-empty result means a
+    // classpath/registry problem, so fail clearly rather than via a missing-key lookup.
+    assert(entries.size > 100)
+
+    val byName = entries.map(e => e.name -> e).toMap
+
+    val append = byName("array_append")
+    assert(append.group == "array_funcs")
+    // The concrete backing class varies across Spark versions (e.g. array_append is
+    // RuntimeReplaceable in Spark 4.0), so only assert the class name is populated.
+    assert(append.className.nonEmpty)
+
+    // Symbolic operator entries such as `+` must be enumerated too.
+    assert(byName.contains("+"))
+    assert(byName("+").group == "math_funcs")
+  }
+}