mirror of https://github.com/apache/superset.git
fix: eliminate cartesian product columns in pivot operator (#15975)
* fix: eliminate cartesian product columns in pivot operator * wip * wip * minor tip
This commit is contained in:
parent
b73d7baedf
commit
c01d42fd98
|
@ -264,6 +264,15 @@ def pivot( # pylint: disable=too-many-arguments
|
|||
# Remove once/if support is added.
|
||||
aggfunc = {na.column: na.aggfunc for na in aggregate_funcs.values()}
|
||||
|
||||
# When dropna = False, the pivot_table function will calculate the cartesian product
|
||||
# for MultiIndex.
|
||||
# https://github.com/apache/superset/issues/15956
|
||||
# https://github.com/pandas-dev/pandas/issues/18030
|
||||
series_set = set()
|
||||
if not drop_missing_columns and columns:
|
||||
for row in df[columns].itertuples():
|
||||
metrics_and_series = tuple(aggfunc.keys()) + tuple(row[1:])
|
||||
series_set.add(str(metrics_and_series))
|
||||
df = df.pivot_table(
|
||||
values=aggfunc.keys(),
|
||||
index=index,
|
||||
|
@ -275,6 +284,12 @@ def pivot( # pylint: disable=too-many-arguments
|
|||
margins_name=marginal_distribution_name,
|
||||
)
|
||||
|
||||
if not drop_missing_columns and len(series_set) > 0 and not df.empty:
|
||||
for col in df.columns:
|
||||
series = str(col)
|
||||
if series not in series_set:
|
||||
df = df.drop(col, axis=PandasAxis.COLUMN)
|
||||
|
||||
if combine_value_with_metric:
|
||||
df = df.stack(0).unstack()
|
||||
|
||||
|
|
|
@ -20,7 +20,8 @@ from importlib.util import find_spec
|
|||
import math
|
||||
from typing import Any, List, Optional
|
||||
|
||||
from pandas import DataFrame, Series, Timestamp
|
||||
import numpy as np
|
||||
from pandas import DataFrame, Series, Timestamp, to_datetime
|
||||
import pytest
|
||||
|
||||
from superset.exceptions import QueryObjectValidationError
|
||||
|
@ -256,6 +257,26 @@ class TestPostProcessing(SupersetTestCase):
|
|||
aggregates={"idx_nulls": {}},
|
||||
)
|
||||
|
||||
def test_pivot_eliminate_cartesian_product_columns(self):
    """Check that pivot keeps only the column combinations present in the data.

    The input frame contains the ``(a, b)`` pairs ``(0, 0)`` and ``(1, 1)``
    only. With ``drop_missing_columns=False`` the result is expected to
    expose exactly the ``"0, 0"`` and ``"1, 1"`` series next to ``dttm``;
    the never-observed combinations ``(0, 1)`` and ``(1, 0)`` must not show
    up as columns. The ``(1, 1)`` series, whose only metric value is NaN,
    must still be retained and hold NaN.
    """
    mock_df = DataFrame(
        {
            "dttm": to_datetime(["2019-01-01", "2019-01-01"]),
            "a": [0, 1],
            "b": [0, 1],
            # Use the canonical ``np.nan``; the ``np.NAN`` alias was removed
            # in NumPy 2.0 and raises AttributeError there.
            "metric": [9, np.nan],
        }
    )

    df = proc.pivot(
        df=mock_df,
        index=["dttm"],
        columns=["a", "b"],
        aggregates={"metric": {"operator": "mean"}},
        drop_missing_columns=False,
    )

    # Only the observed series survive, in this order.
    self.assertEqual(list(df.columns), ["dttm", "0, 0", "1, 1"])
    # The all-NaN series is kept rather than dropped.
    self.assertTrue(np.isnan(df["1, 1"][0]))
|
||||
|
||||
def test_aggregate(self):
|
||||
aggregates = {
|
||||
"asc sum": {"column": "asc_idx", "operator": "sum"},
|
||||
|
|
Loading…
Reference in New Issue