superset/tests/unit_tests/pandas_postprocessing/test_compare.py

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
import pandas as pd

from superset.constants import PandasPostprocessingCompare as PPC
from superset.utils import pandas_postprocessing as pp
from superset.utils.pandas_postprocessing.utils import FLAT_COLUMN_SEPARATOR
from tests.unit_tests.fixtures.dataframes import multiple_metrics_df, timeseries_df2


def test_compare_should_not_side_effect():
    """Verify that ``pp.compare`` leaves the dataframe it is given unchanged."""
    # Hand a private copy to compare so the shared fixture stays the baseline.
    working_copy = timeseries_df2.copy()
    pp.compare(
        df=working_copy,
        source_columns=["y"],
        compare_columns=["z"],
        compare_type=PPC.DIFF,
    )
    # The copy must still be identical to the untouched fixture afterwards.
    assert working_copy.equals(timeseries_df2)


def test_compare_diff():
    """Check the ``difference`` comparison, with and without the originals kept."""
    labels = ["x", "y", "z", "q"]
    deltas = [0.0, -2.0, -8.0, -6.0]
    diff_options = {
        "source_columns": ["y"],
        "compare_columns": ["z"],
        "compare_type": PPC.DIFF,
    }

    # `difference` comparison, original columns retained. Expected result:
    #
    #            label    y     z  difference__y__z
    # 2019-01-01     x  2.0   2.0               0.0
    # 2019-01-02     y  2.0   4.0              -2.0
    # 2019-01-05     z  2.0  10.0              -8.0
    # 2019-01-07     q  2.0   8.0              -6.0
    with_originals = pp.compare(df=timeseries_df2, **diff_options)
    expected_with_originals = pd.DataFrame(
        index=timeseries_df2.index,
        data={
            "label": labels,
            "y": [2.0, 2.0, 2.0, 2.0],
            "z": [2.0, 4.0, 10.0, 8.0],
            "difference__y__z": deltas,
        },
    )
    assert with_originals.equals(expected_with_originals)

    # drop original columns: only the label and the computed column remain
    without_originals = pp.compare(
        df=timeseries_df2,
        drop_original_columns=True,
        **diff_options,
    )
    expected_without_originals = pd.DataFrame(
        index=timeseries_df2.index,
        data={
            "label": labels,
            "difference__y__z": deltas,
        },
    )
    assert without_originals.equals(expected_without_originals)


def test_compare_percentage():
    """Check the ``percentage`` comparison of column ``y`` against column ``z``."""
    result = pp.compare(
        df=timeseries_df2,
        source_columns=["y"],
        compare_columns=["z"],
        compare_type=PPC.PCT,
    )

    # Expected result (values are consistent with (y - z) / z):
    #
    #            label    y     z  percentage__y__z
    # 2019-01-01     x  2.0   2.0              0.00
    # 2019-01-02     y  2.0   4.0             -0.50
    # 2019-01-05     z  2.0  10.0             -0.80
    # 2019-01-07     q  2.0   8.0             -0.75
    expected = pd.DataFrame(
        index=timeseries_df2.index,
        data={
            "label": ["x", "y", "z", "q"],
            "y": [2.0, 2.0, 2.0, 2.0],
            "z": [2.0, 4.0, 10.0, 8.0],
            "percentage__y__z": [0.0, -0.50, -0.80, -0.75],
        },
    )
    assert result.equals(expected)


def test_compare_ratio():
    """Check the ``ratio`` comparison of column ``y`` against column ``z``."""
    result = pp.compare(
        df=timeseries_df2,
        source_columns=["y"],
        compare_columns=["z"],
        compare_type=PPC.RAT,
    )

    # Expected result (values are consistent with y / z):
    #
    #            label    y     z  ratio__y__z
    # 2019-01-01     x  2.0   2.0         1.00
    # 2019-01-02     y  2.0   4.0         0.50
    # 2019-01-05     z  2.0  10.0         0.20
    # 2019-01-07     q  2.0   8.0         0.25
    expected = pd.DataFrame(
        index=timeseries_df2.index,
        data={
            "label": ["x", "y", "z", "q"],
            "y": [2.0, 2.0, 2.0, 2.0],
            "z": [2.0, 4.0, 10.0, 8.0],
            "ratio__y__z": [1.00, 0.50, 0.20, 0.25],
        },
    )
    assert result.equals(expected)


def test_compare_multi_index_column():
    """Check ``difference`` on a three-level column MultiIndex, then flatten it.

    The expected flattened column names are built with
    ``FLAT_COLUMN_SEPARATOR`` rather than a hard-coded ", " so this test
    stays in step with the separator used by ``pp.flatten``, matching how
    ``test_compare_after_pivot`` spells its expected columns.
    """
    index = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"])
    index.name = "__timestamp"
    iterables = [["m1", "m2"], ["a", "b"], ["x", "y"]]
    columns = pd.MultiIndex.from_product(iterables, names=[None, "level1", "level2"])
    df = pd.DataFrame(index=index, columns=columns, data=1)
    # Input frame:
    #
    #             m1          m2
    # level1       a     b     a     b
    # level2       x  y  x  y  x  y  x  y
    # __timestamp
    # 2021-01-01   1  1  1  1  1  1  1  1
    # 2021-01-02   1  1  1  1  1  1  1  1
    # 2021-01-03   1  1  1  1  1  1  1  1
    post_df = pp.compare(
        df,
        source_columns=["m1"],
        compare_columns=["m2"],
        compare_type=PPC.DIFF,
        drop_original_columns=True,
    )
    flat_df = pp.flatten(post_df)
    # Flattened result: one `difference__m1__m2` column per (level1, level2)
    # pair, every value 0 because all inputs are 1:
    #
    #   __timestamp  difference__m1__m2, a, x  ...  difference__m1__m2, b, y
    # 0  2021-01-01                         0  ...                         0
    # 1  2021-01-02                         0  ...                         0
    # 2  2021-01-03                         0  ...                         0
    expected = pd.DataFrame(
        data={
            "__timestamp": pd.to_datetime(
                ["2021-01-01", "2021-01-02", "2021-01-03"]
            ),
            FLAT_COLUMN_SEPARATOR.join(["difference__m1__m2", "a", "x"]): [
                0,
                0,
                0,
            ],
            FLAT_COLUMN_SEPARATOR.join(["difference__m1__m2", "a", "y"]): [
                0,
                0,
                0,
            ],
            FLAT_COLUMN_SEPARATOR.join(["difference__m1__m2", "b", "x"]): [
                0,
                0,
                0,
            ],
            FLAT_COLUMN_SEPARATOR.join(["difference__m1__m2", "b", "y"]): [
                0,
                0,
                0,
            ],
        }
    )
    assert flat_df.equals(expected)


def test_compare_after_pivot():
    """Run pivot -> compare (``difference``) -> flatten and check the flat frame."""
    sum_aggregates = {
        "sum_metric": {"operator": "sum"},
        "count_metric": {"operator": "sum"},
    }
    pivoted = pp.pivot(
        df=multiple_metrics_df,
        index=["dttm"],
        columns=["country"],
        aggregates=sum_aggregates,
    )
    # Pivoted frame:
    #
    #                count_metric    sum_metric
    # country              UK US         UK US
    # dttm
    # 2019-01-01            1  2          5  6
    # 2019-01-02            3  4          7  8
    diffed = pp.compare(
        pivoted,
        source_columns=["count_metric"],
        compare_columns=["sum_metric"],
        compare_type=PPC.DIFF,
        drop_original_columns=True,
    )
    # After compare, with the original columns dropped:
    #
    #            difference__count_metric__sum_metric
    # country                                      UK US
    # dttm
    # 2019-01-01                                   -4 -4
    # 2019-01-02                                   -4 -4
    flattened = pp.flatten(diffed)
    # After flatten:
    #
    #         dttm  difference__count_metric__sum_metric, UK  difference__count_metric__sum_metric, US
    # 0 2019-01-01                                        -4                                        -4
    # 1 2019-01-02                                        -4                                        -4
    uk_column = FLAT_COLUMN_SEPARATOR.join(
        ["difference__count_metric__sum_metric", "UK"]
    )
    us_column = FLAT_COLUMN_SEPARATOR.join(
        ["difference__count_metric__sum_metric", "US"]
    )
    expected = pd.DataFrame(
        data={
            "dttm": pd.to_datetime(["2019-01-01", "2019-01-02"]),
            uk_column: [-4, -4],
            us_column: [-4, -4],
        }
    )
    assert flattened.equals(expected)