fix(chart-data-api): support numeric temporal columns (#13138)

This commit is contained in:
Ville Brofeldt 2021-02-16 09:51:22 +02:00 committed by GitHub
parent c9f76d58f0
commit d8c32b8097
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 73 additions and 39 deletions

View File

@ -17,7 +17,6 @@
import copy
import logging
import math
from datetime import timedelta
from typing import Any, cast, ClassVar, Dict, List, Optional, Union
import numpy as np
@ -112,17 +111,12 @@ class QueryContext:
# If the datetime format is unix, the parse will use the corresponding
# parsing logic
if not df.empty:
if DTTM_ALIAS in df.columns:
if timestamp_format in ("epoch_s", "epoch_ms"):
# Column has already been formatted as a timestamp.
df[DTTM_ALIAS] = df[DTTM_ALIAS].apply(pd.Timestamp)
else:
df[DTTM_ALIAS] = pd.to_datetime(
df[DTTM_ALIAS], utc=False, format=timestamp_format
)
if self.datasource.offset:
df[DTTM_ALIAS] += timedelta(hours=self.datasource.offset)
df[DTTM_ALIAS] += query_object.time_shift
df = utils.normalize_dttm_col(
df=df,
timestamp_format=timestamp_format,
offset=self.datasource.offset,
time_shift=query_object.time_shift,
)
if self.enforce_numerical_metrics:
self.df_metrics_to_num(df, query_object)

View File

@ -76,6 +76,7 @@ from flask_appbuilder.security.sqla.models import Role, User
from flask_babel import gettext as __
from flask_babel.speaklater import LazyString
from pandas.api.types import infer_dtype
from pandas.core.dtypes.common import is_numeric_dtype
from sqlalchemy import event, exc, select, Text
from sqlalchemy.dialects.mysql import MEDIUMTEXT
from sqlalchemy.engine import Connection, Engine
@ -1579,3 +1580,34 @@ def format_list(items: Sequence[str], sep: str = ", ", quote: str = '"') -> str:
def find_duplicates(items: Iterable[InputType]) -> List[InputType]:
    """
    Collect the items that appear more than once in an iterable.

    :param items: iterable of hashable items to inspect
    :return: list holding each repeated item once, in the order the
        underlying ``collections.Counter`` yields its keys
    """
    occurrences = collections.Counter(items)
    repeated = []
    for candidate, seen in occurrences.items():
        if seen > 1:
            repeated.append(candidate)
    return repeated
def normalize_dttm_col(
    df: pd.DataFrame,
    timestamp_format: Optional[str],
    offset: int,
    time_shift: Optional[timedelta],
) -> pd.DataFrame:
    """
    Convert the ``DTTM_ALIAS`` column of a dataframe to pandas datetimes.

    When the dataframe has no ``DTTM_ALIAS`` column it is returned as-is.
    Otherwise the conversion is applied to a copy and the copy is returned.

    :param df: dataframe that may contain the ``DTTM_ALIAS`` column
    :param timestamp_format: ``"epoch_s"`` / ``"epoch_ms"`` for epoch based
        columns; any other value is forwarded as the ``format`` argument of
        ``pd.to_datetime``
    :param offset: number of hours added to the column when truthy
    :param time_shift: timedelta added to the column when not ``None``
    :return: dataframe with the normalized ``DTTM_ALIAS`` column
    """
    if DTTM_ALIAS not in df.columns:
        return df

    normalized = df.copy()
    raw_values = normalized[DTTM_ALIAS]

    if timestamp_format not in ("epoch_s", "epoch_ms"):
        # Regular (non-epoch) column: parse with the given format.
        converted = pd.to_datetime(raw_values, utc=False, format=timestamp_format)
    elif is_numeric_dtype(raw_values):
        # Column is formatted as a numeric value
        epoch_unit = "s" if timestamp_format == "epoch_s" else "ms"
        converted = pd.to_datetime(
            raw_values, utc=False, unit=epoch_unit, origin="unix"
        )
    else:
        # Column has already been formatted as a timestamp.
        converted = raw_values.apply(pd.Timestamp)
    normalized[DTTM_ALIAS] = converted

    if offset:
        normalized[DTTM_ALIAS] += timedelta(hours=offset)
    if time_shift is not None:
        normalized[DTTM_ALIAS] += time_shift
    return normalized

View File

@ -284,33 +284,12 @@ class BaseViz:
# If the datetime format is unix, the parse will use the corresponding
# parsing logic.
if not df.empty:
if DTTM_ALIAS in df.columns:
if timestamp_format in ("epoch_s", "epoch_ms"):
# Column has already been formatted as a timestamp.
dttm_col = df[DTTM_ALIAS]
one_ts_val = dttm_col[0]
# convert time column to pandas Timestamp, but different
# ways to convert depending on string or int types
try:
int(one_ts_val)
is_integral = True
except (ValueError, TypeError):
is_integral = False
if is_integral:
unit = "s" if timestamp_format == "epoch_s" else "ms"
df[DTTM_ALIAS] = pd.to_datetime(
dttm_col, utc=False, unit=unit, origin="unix"
)
else:
df[DTTM_ALIAS] = dttm_col.apply(pd.Timestamp)
else:
df[DTTM_ALIAS] = pd.to_datetime(
df[DTTM_ALIAS], utc=False, format=timestamp_format
)
if self.datasource.offset:
df[DTTM_ALIAS] += timedelta(hours=self.datasource.offset)
df[DTTM_ALIAS] += self.time_shift
df = utils.normalize_dttm_col(
df=df,
timestamp_format=timestamp_format,
offset=self.datasource.offset,
time_shift=self.time_shift,
)
if self.enforce_numerical_metrics:
self.df_metrics_to_num(df)

View File

@ -45,6 +45,7 @@ from superset.utils.core import (
cast_to_num,
convert_legacy_filters_into_adhoc,
create_ssl_cert_file,
DTTM_ALIAS,
format_timedelta,
GenericDataType,
get_form_data_token,
@ -59,6 +60,7 @@ from superset.utils.core import (
merge_extra_filters,
merge_extra_form_data,
merge_request_params,
normalize_dttm_col,
parse_ssl_cert,
parse_js_uri_path_item,
extract_dataframe_dtypes,
@ -1131,3 +1133,30 @@ class TestUtils(SupersetTestCase):
df = pd.DataFrame(data={col[0]: col[2] for col in cols})
assert extract_dataframe_dtypes(df) == [col[1] for col in cols]
def test_normalize_dttm_col(self):
    """Check normalize_dttm_col for timestamp and numeric epoch inputs."""

    def first_dttm(frame, timestamp_format, offset, time_shift):
        # First value of the temporal column after normalization.
        normalized = normalize_dttm_col(frame, timestamp_format, offset, time_shift)
        return normalized[DTTM_ALIAS][0]

    expected = pd.Timestamp(2021, 2, 15, 19, 0, 0, 0)
    frame = pd.DataFrame([{"__timestamp": expected, "a": 1}])

    # test regular (non-numeric) format
    for timestamp_format in (None, "epoch_ms", "epoch_s"):
        assert first_dttm(frame, timestamp_format, 0, None) == expected

    # test offset
    one_hour_later = pd.Timestamp(2021, 2, 15, 20, 0, 0, 0)
    assert first_dttm(frame, None, 1, None) == one_hour_later

    # test offset and timedelta
    shifted = pd.Timestamp(2021, 2, 15, 20, 30, 0, 0)
    assert first_dttm(frame, None, 1, timedelta(minutes=30)) == shifted

    # test numeric epoch_s and epoch_ms formats
    for timestamp_format, scale in (("epoch_s", 1), ("epoch_ms", 1000)):
        numeric_frame = pd.DataFrame(
            [{"__timestamp": expected.timestamp() * scale, "a": 1}]
        )
        assert first_dttm(numeric_frame, timestamp_format, 0, None) == expected