# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
import pandas as pd

from superset.utils import pandas_postprocessing as pp
from superset.utils.pandas_postprocessing.utils import FLAT_COLUMN_SEPARATOR
from tests.unit_tests.fixtures.dataframes import timeseries_df


def _timestamp_index():
    """Build the three-day datetime index, named ``__timestamp``, used below."""
    timestamps = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"])
    timestamps.name = "__timestamp"
    return timestamps


def test_flat_should_not_change():
    """A frame with plain columns and a default index compares equal after flatten."""
    frame = pd.DataFrame(data={"foo": [1, 2, 3], "bar": [4, 5, 6]})

    result = pp.flatten(frame)

    assert result.equals(frame)


def test_flat_should_not_reset_index():
    """With ``reset_index=False`` the named datetime index is left in place."""
    frame = pd.DataFrame(
        index=_timestamp_index(),
        data={"foo": [1, 2, 3], "bar": [4, 5, 6]},
    )

    result = pp.flatten(frame, reset_index=False)

    assert result.equals(frame)


def test_flat_should_flat_datetime_index():
    """By default the named datetime index is turned into a regular column."""
    timestamps = _timestamp_index()
    frame = pd.DataFrame(
        index=timestamps,
        data={"foo": [1, 2, 3], "bar": [4, 5, 6]},
    )

    result = pp.flatten(frame)
    expected = pd.DataFrame(
        {
            "__timestamp": timestamps,
            "foo": [1, 2, 3],
            "bar": [4, 5, 6],
        }
    )

    assert result.equals(expected)


def test_flat_should_flat_multiple_index():
    """Two-level columns are joined into single labels with the separator."""
    timestamps = _timestamp_index()
    level_values = [["foo", "bar"], [1, "two"]]
    multi_columns = pd.MultiIndex.from_product(
        level_values, names=["level1", "level2"]
    )
    frame = pd.DataFrame(index=timestamps, columns=multi_columns, data=1)

    result = pp.flatten(frame)

    # The integer level value 1 is expected to appear as the string "1".
    expected_labels = (
        ["foo", "1"],
        ["foo", "two"],
        ["bar", "1"],
        ["bar", "two"],
    )
    expected = pd.DataFrame(
        {
            "__timestamp": timestamps,
            **{
                FLAT_COLUMN_SEPARATOR.join(parts): [1, 1, 1]
                for parts in expected_labels
            },
        }
    )

    assert result.equals(expected)


def test_flat_should_drop_index_level():
    """Column levels can be dropped by position or by name before joining."""
    timestamps = _timestamp_index()
    multi_columns = pd.MultiIndex.from_arrays(
        [["a"] * 3, ["b"] * 3, ["c", "d", "e"], ["ff", "ii", "gg"]],
        names=["level1", "level2", "level3", "level4"],
    )
    frame = pd.DataFrame(index=timestamps, columns=multi_columns, data=1)

    def expected_frame(label_parts):
        # One constant column per joined label, preceded by the timestamp column.
        return pd.DataFrame(
            {
                "__timestamp": timestamps,
                **{
                    FLAT_COLUMN_SEPARATOR.join(parts): [1, 1, 1]
                    for parts in label_parts
                },
            }
        )

    # drop level by index
    by_position = pp.flatten(frame.copy(), drop_levels=(0, 1))
    assert by_position.equals(
        expected_frame((["c", "ff"], ["d", "ii"], ["e", "gg"]))
    )

    # drop level by name
    by_name = pp.flatten(frame.copy(), drop_levels=("level1", "level2"))
    assert by_name.equals(
        expected_frame((["c", "ff"], ["d", "ii"], ["e", "gg"]))
    )

    # only leave 1 level
    single_level = pp.flatten(frame.copy(), drop_levels=(0, 1, 2))
    assert single_level.equals(expected_frame((["ff"], ["ii"], ["gg"])))


def test_flat_should_not_droplevel():
    """Flattening ``timeseries_df`` with ``drop_levels=(0,)`` keeps its columns.

    NOTE(review): presumably the fixture has single-level columns, which is
    why nothing is dropped -- confirm against the fixture definition.
    """
    result = pp.flatten(timeseries_df, drop_levels=(0,))
    expected = pd.DataFrame(
        {
            "index": pd.to_datetime(
                ["2019-01-01", "2019-01-02", "2019-01-05", "2019-01-07"]
            ),
            "label": ["x", "y", "z", "q"],
            "y": [1.0, 2.0, 3.0, 4.0],
        }
    )

    assert result.equals(expected)


def test_flat_integer_column_name():
    """Integer labels left after dropping a level come out as strings."""
    multi_columns = pd.MultiIndex.from_arrays(
        [["a"] * 3, [100, 200, 300]],
        names=["level1", "level2"],
    )
    frame = pd.DataFrame(index=_timestamp_index(), columns=multi_columns, data=1)

    result = pp.flatten(frame, drop_levels=(0,))
    expected = pd.DataFrame(
        {
            "__timestamp": pd.to_datetime(
                ["2021-01-01", "2021-01-02", "2021-01-03"]
            ),
            "100": [1, 1, 1],
            "200": [1, 1, 1],
            "300": [1, 1, 1],
        }
    )

    assert result.equals(expected)


def test_escape_column_name():
    """Commas inside level values are backslash-escaped in the joined labels."""
    multi_columns = pd.MultiIndex.from_arrays(
        [
            ["level1,value1", "level1,value2", "level1,value3"],
            ["level2, value1", "level2, value2", "level2, value3"],
        ],
        names=["level1", "level2"],
    )
    frame = pd.DataFrame(index=_timestamp_index(), columns=multi_columns, data=1)

    flattened_labels = list(pp.flatten(frame).columns.values)
    expected_labels = [
        "__timestamp",
        FLAT_COLUMN_SEPARATOR.join(("level1\\,value1", "level2\\, value1")),
        FLAT_COLUMN_SEPARATOR.join(("level1\\,value2", "level2\\, value2")),
        FLAT_COLUMN_SEPARATOR.join(("level1\\,value3", "level2\\, value3")),
    ]

    assert flattened_labels == expected_labels