2022-02-17 07:05:41 -05:00
|
|
|
# Licensed to the Apache Software Foundation (ASF) under one
|
|
|
|
# or more contributor license agreements. See the NOTICE file
|
|
|
|
# distributed with this work for additional information
|
|
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
|
|
# to you under the Apache License, Version 2.0 (the
|
|
|
|
# "License"); you may not use this file except in compliance
|
|
|
|
# with the License. You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing,
|
|
|
|
# software distributed under the License is distributed on an
|
|
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
|
|
# KIND, either express or implied. See the License for the
|
|
|
|
# specific language governing permissions and limitations
|
|
|
|
# under the License.
|
2022-03-28 10:30:45 -04:00
|
|
|
import numpy as np
|
2022-03-23 01:46:28 -04:00
|
|
|
import pandas as pd
|
2022-02-17 07:05:41 -05:00
|
|
|
import pytest
|
2022-03-28 10:30:45 -04:00
|
|
|
from pandas import to_datetime
|
2022-02-17 07:05:41 -05:00
|
|
|
|
2022-03-23 01:46:28 -04:00
|
|
|
from superset.exceptions import InvalidPostProcessingError
|
|
|
|
from superset.utils import pandas_postprocessing as pp
|
|
|
|
from tests.unit_tests.fixtures.dataframes import categories_df, timeseries_df
|
2022-02-17 07:05:41 -05:00
|
|
|
|
|
|
|
|
2022-03-23 01:46:28 -04:00
|
|
|
def test_resample_should_not_side_effect():
    """Check that ``pp.resample`` leaves the frame it is given unchanged."""
    # Work on a copy so the shared fixture stays available as the reference.
    working_copy = timeseries_df.copy()

    # The return value is deliberately ignored; only the input matters here.
    pp.resample(df=working_copy, rule="1D", method="ffill")

    assert working_copy.equals(timeseries_df)
|
|
|
|
|
2022-02-17 07:05:41 -05:00
|
|
|
|
2022-03-23 01:46:28 -04:00
|
|
|
def test_resample():
    """Resample ``timeseries_df`` daily with ``ffill`` and compare the result."""
    resampled = pp.resample(df=timeseries_df, rule="1D", method="ffill")

    # Expected frame:
    #
    #            label    y
    # 2019-01-01     x  1.0
    # 2019-01-02     y  2.0
    # 2019-01-03     y  2.0
    # 2019-01-04     y  2.0
    # 2019-01-05     z  3.0
    # 2019-01-06     z  3.0
    # 2019-01-07     q  4.0
    daily_index = pd.to_datetime(
        [f"2019-01-{day:02d}" for day in range(1, 8)]
    )
    expected = pd.DataFrame(
        index=daily_index,
        data={
            "label": ["x", "y", "y", "y", "z", "z", "q"],
            "y": [1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 4.0],
        },
    )

    assert resampled.equals(expected)
|
2022-02-17 07:05:41 -05:00
|
|
|
|
|
|
|
|
2022-03-23 01:46:28 -04:00
|
|
|
def test_resample_zero_fill():
    """Resample daily with ``asfreq`` and ``fill_value=0``; compare the result.

    The expected frame carries ``0`` in both columns on the rows that are
    filled in, including the string-valued ``label`` column.
    """
    resampled = pp.resample(
        df=timeseries_df,
        rule="1D",
        method="asfreq",
        fill_value=0,
    )

    daily_index = pd.to_datetime(
        [f"2019-01-{day:02d}" for day in range(1, 8)]
    )
    expected = pd.DataFrame(
        index=daily_index,
        data={
            "label": ["x", "y", 0, 0, "z", 0, "q"],
            "y": [1.0, 2.0, 0, 0, 3.0, 0, 4.0],
        },
    )

    assert resampled.equals(expected)
|
|
|
|
|
|
|
|
|
2022-03-23 01:46:28 -04:00
|
|
|
def test_resample_after_pivot():
    """Chain ``pivot`` -> ``resample`` -> ``flatten`` and check the flat frame."""
    timestamps = ["2022-01-13"] * 3 + ["2022-01-11"] * 3
    source = pd.DataFrame(
        data={
            "__timestamp": pd.to_datetime(timestamps),
            "city": ["Chicago", "LA", "NY", "Chicago", "LA", "NY"],
            "val": [6.0, 5.0, 4.0, 3.0, 2.0, 1.0],
        }
    )

    pivoted = pp.pivot(
        df=source,
        index=["__timestamp"],
        columns=["city"],
        aggregates={"val": {"operator": "sum"}},
    )
    # After the pivot:
    #
    #                 val
    # city        Chicago   LA   NY
    # __timestamp
    # 2022-01-11      3.0  2.0  1.0
    # 2022-01-13      6.0  5.0  4.0

    resampled = pp.resample(
        df=pivoted,
        rule="1D",
        method="asfreq",
        fill_value=0,
    )
    # After resampling (the missing day is filled with zeros):
    #
    #                 val
    # city        Chicago   LA   NY
    # __timestamp
    # 2022-01-11      3.0  2.0  1.0
    # 2022-01-12      0.0  0.0  0.0
    # 2022-01-13      6.0  5.0  4.0

    flattened = pp.flatten(resampled)
    # After flattening:
    #
    #   __timestamp  val, Chicago  val, LA  val, NY
    # 0  2022-01-11           3.0      2.0      1.0
    # 1  2022-01-12           0.0      0.0      0.0
    # 2  2022-01-13           6.0      5.0      4.0

    expected = pd.DataFrame(
        data={
            "__timestamp": pd.to_datetime(
                ["2022-01-11", "2022-01-12", "2022-01-13"]
            ),
            "val, Chicago": [3.0, 0, 6.0],
            "val, LA": [2.0, 0, 5.0],
            "val, NY": [1.0, 0, 4.0],
        }
    )

    assert flattened.equals(expected)
|
|
|
|
|
|
|
|
|
2022-03-23 01:46:28 -04:00
|
|
|
def test_resample_should_raise_ex():
    """Check that both invalid calls raise ``InvalidPostProcessingError``.

    Two cases are exercised, in order: resampling ``categories_df`` with
    ``asfreq``, and resampling ``timeseries_df`` with the method name
    ``"foobar"``.
    """
    failing_calls = (
        {"df": categories_df, "rule": "1D", "method": "asfreq"},
        {"df": timeseries_df, "rule": "1D", "method": "foobar"},
    )

    for call_kwargs in failing_calls:
        with pytest.raises(InvalidPostProcessingError):
            pp.resample(**call_kwargs)
|
|
|
|
|
|
|
|
|
|
|
|
def test_resample_linear():
    """Resample a sparse frame daily with ``linear`` and compare the result.

    The input has rows on 2019-01-01, 2019-01-05 and 2019-01-08. The expected
    frame has one row per day: the numeric ``y`` column is interpolated
    linearly, while the string ``label`` column is NaN on the inserted days.
    """
    df = pd.DataFrame(
        index=to_datetime(["2019-01-01", "2019-01-05", "2019-01-08"]),
        data={"label": ["a", "e", "j"], "y": [1.0, 5.0, 8.0]},
    )

    post_df = pp.resample(df=df, rule="1D", method="linear")

    # Expected frame:
    #
    #            label    y
    # 2019-01-01     a  1.0
    # 2019-01-02   NaN  2.0
    # 2019-01-03   NaN  3.0
    # 2019-01-04   NaN  4.0
    # 2019-01-05     e  5.0
    # 2019-01-06   NaN  6.0
    # 2019-01-07   NaN  7.0
    # 2019-01-08     j  8.0
    expected = pd.DataFrame(
        index=pd.to_datetime(
            [
                "2019-01-01",
                "2019-01-02",
                "2019-01-03",
                "2019-01-04",
                "2019-01-05",
                "2019-01-06",
                "2019-01-07",
                "2019-01-08",
            ]
        ),
        data={
            # Use ``np.nan``: the ``np.NaN`` alias was removed in NumPy 2.0,
            # and ``np.nan`` is the same object on earlier versions.
            "label": ["a", np.nan, np.nan, np.nan, "e", np.nan, np.nan, "j"],
            "y": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
        },
    )

    assert post_df.equals(expected)
|