fix(chart-data-api): support numeric temporal columns (#13138)

This commit is contained in:
Ville Brofeldt 2021-02-16 09:51:22 +02:00 committed by GitHub
parent c9f76d58f0
commit d8c32b8097
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 73 additions and 39 deletions

View File

@ -17,7 +17,6 @@
import copy import copy
import logging import logging
import math import math
from datetime import timedelta
from typing import Any, cast, ClassVar, Dict, List, Optional, Union from typing import Any, cast, ClassVar, Dict, List, Optional, Union
import numpy as np import numpy as np
@ -112,17 +111,12 @@ class QueryContext:
# If the datetime format is unix, the parse will use the corresponding # If the datetime format is unix, the parse will use the corresponding
# parsing logic # parsing logic
if not df.empty: if not df.empty:
if DTTM_ALIAS in df.columns: df = utils.normalize_dttm_col(
if timestamp_format in ("epoch_s", "epoch_ms"): df=df,
# Column has already been formatted as a timestamp. timestamp_format=timestamp_format,
df[DTTM_ALIAS] = df[DTTM_ALIAS].apply(pd.Timestamp) offset=self.datasource.offset,
else: time_shift=query_object.time_shift,
df[DTTM_ALIAS] = pd.to_datetime( )
df[DTTM_ALIAS], utc=False, format=timestamp_format
)
if self.datasource.offset:
df[DTTM_ALIAS] += timedelta(hours=self.datasource.offset)
df[DTTM_ALIAS] += query_object.time_shift
if self.enforce_numerical_metrics: if self.enforce_numerical_metrics:
self.df_metrics_to_num(df, query_object) self.df_metrics_to_num(df, query_object)

View File

@ -76,6 +76,7 @@ from flask_appbuilder.security.sqla.models import Role, User
from flask_babel import gettext as __ from flask_babel import gettext as __
from flask_babel.speaklater import LazyString from flask_babel.speaklater import LazyString
from pandas.api.types import infer_dtype from pandas.api.types import infer_dtype
from pandas.core.dtypes.common import is_numeric_dtype
from sqlalchemy import event, exc, select, Text from sqlalchemy import event, exc, select, Text
from sqlalchemy.dialects.mysql import MEDIUMTEXT from sqlalchemy.dialects.mysql import MEDIUMTEXT
from sqlalchemy.engine import Connection, Engine from sqlalchemy.engine import Connection, Engine
@ -1579,3 +1580,34 @@ def format_list(items: Sequence[str], sep: str = ", ", quote: str = '"') -> str:
def find_duplicates(items: Iterable[InputType]) -> List[InputType]: def find_duplicates(items: Iterable[InputType]) -> List[InputType]:
"""Find duplicate items in an iterable.""" """Find duplicate items in an iterable."""
return [item for item, count in collections.Counter(items).items() if count > 1] return [item for item, count in collections.Counter(items).items() if count > 1]
def normalize_dttm_col(
    df: pd.DataFrame,
    timestamp_format: Optional[str],
    offset: int,
    time_shift: Optional[timedelta],
) -> pd.DataFrame:
    """Return a copy of ``df`` with the ``DTTM_ALIAS`` column as timestamps.

    When ``df`` has no ``DTTM_ALIAS`` column it is returned as-is (no copy
    is made). Otherwise the column is converted on a copy of the frame:

    * ``timestamp_format`` of ``"epoch_s"``/``"epoch_ms"`` with a numeric
      column: values are read as seconds/milliseconds since the unix epoch.
    * ``"epoch_s"``/``"epoch_ms"`` with a non-numeric column: every value is
      passed through ``pd.Timestamp``.
    * any other ``timestamp_format`` (including ``None``): the column is
      parsed with ``pd.to_datetime`` using that format.

    :param df: frame that may contain a ``DTTM_ALIAS`` column
    :param timestamp_format: ``"epoch_s"``, ``"epoch_ms"``, a format string
        handed to ``pd.to_datetime``, or ``None``
    :param offset: number of hours added to the column; skipped when falsy
    :param time_shift: timedelta added to the column; skipped when ``None``
    :returns: the untouched input frame, or a normalized copy of it
    """
    if DTTM_ALIAS not in df.columns:
        return df

    normalized = df.copy()
    raw_col = normalized[DTTM_ALIAS]

    if timestamp_format not in ("epoch_s", "epoch_ms"):
        dttm = pd.to_datetime(raw_col, utc=False, format=timestamp_format)
    elif is_numeric_dtype(raw_col):
        # "epoch_s" -> "s", "epoch_ms" -> "ms": the unit of the numeric values
        epoch_unit = timestamp_format.replace("epoch_", "")
        dttm = pd.to_datetime(raw_col, utc=False, unit=epoch_unit, origin="unix")
    else:
        # Column has already been formatted as a timestamp.
        dttm = raw_col.apply(pd.Timestamp)

    if offset:
        dttm = dttm + timedelta(hours=offset)
    if time_shift is not None:
        dttm = dttm + time_shift

    normalized[DTTM_ALIAS] = dttm
    return normalized

View File

@ -284,33 +284,12 @@ class BaseViz:
# If the datetime format is unix, the parse will use the corresponding # If the datetime format is unix, the parse will use the corresponding
# parsing logic. # parsing logic.
if not df.empty: if not df.empty:
if DTTM_ALIAS in df.columns: df = utils.normalize_dttm_col(
if timestamp_format in ("epoch_s", "epoch_ms"): df=df,
# Column has already been formatted as a timestamp. timestamp_format=timestamp_format,
dttm_col = df[DTTM_ALIAS] offset=self.datasource.offset,
one_ts_val = dttm_col[0] time_shift=self.time_shift,
)
# convert time column to pandas Timestamp, but different
# ways to convert depending on string or int types
try:
int(one_ts_val)
is_integral = True
except (ValueError, TypeError):
is_integral = False
if is_integral:
unit = "s" if timestamp_format == "epoch_s" else "ms"
df[DTTM_ALIAS] = pd.to_datetime(
dttm_col, utc=False, unit=unit, origin="unix"
)
else:
df[DTTM_ALIAS] = dttm_col.apply(pd.Timestamp)
else:
df[DTTM_ALIAS] = pd.to_datetime(
df[DTTM_ALIAS], utc=False, format=timestamp_format
)
if self.datasource.offset:
df[DTTM_ALIAS] += timedelta(hours=self.datasource.offset)
df[DTTM_ALIAS] += self.time_shift
if self.enforce_numerical_metrics: if self.enforce_numerical_metrics:
self.df_metrics_to_num(df) self.df_metrics_to_num(df)

View File

@ -45,6 +45,7 @@ from superset.utils.core import (
cast_to_num, cast_to_num,
convert_legacy_filters_into_adhoc, convert_legacy_filters_into_adhoc,
create_ssl_cert_file, create_ssl_cert_file,
DTTM_ALIAS,
format_timedelta, format_timedelta,
GenericDataType, GenericDataType,
get_form_data_token, get_form_data_token,
@ -59,6 +60,7 @@ from superset.utils.core import (
merge_extra_filters, merge_extra_filters,
merge_extra_form_data, merge_extra_form_data,
merge_request_params, merge_request_params,
normalize_dttm_col,
parse_ssl_cert, parse_ssl_cert,
parse_js_uri_path_item, parse_js_uri_path_item,
extract_dataframe_dtypes, extract_dataframe_dtypes,
@ -1131,3 +1133,30 @@ class TestUtils(SupersetTestCase):
df = pd.DataFrame(data={col[0]: col[2] for col in cols}) df = pd.DataFrame(data={col[0]: col[2] for col in cols})
assert extract_dataframe_dtypes(df) == [col[1] for col in cols] assert extract_dataframe_dtypes(df) == [col[1] for col in cols]
def test_normalize_dttm_col(self):
    """Check normalize_dttm_col on timestamp and numeric epoch columns."""

    def first_dttm(frame, timestamp_format, offset, time_shift):
        # Normalize the frame and return the first temporal value.
        normalized = normalize_dttm_col(frame, timestamp_format, offset, time_shift)
        return normalized[DTTM_ALIAS][0]

    ts = pd.Timestamp(2021, 2, 15, 19, 0, 0, 0)
    ts_frame = pd.DataFrame([{"__timestamp": ts, "a": 1}])

    # test regular (non-numeric) format: the value must come back unchanged
    for timestamp_format in (None, "epoch_ms", "epoch_s"):
        assert first_dttm(ts_frame, timestamp_format, 0, None) == ts

    # test offset
    one_hour_later = pd.Timestamp(2021, 2, 15, 20, 0, 0, 0)
    assert first_dttm(ts_frame, None, 1, None) == one_hour_later

    # test offset and timedelta
    shifted = pd.Timestamp(2021, 2, 15, 20, 30, 0, 0)
    assert first_dttm(ts_frame, None, 1, timedelta(minutes=30)) == shifted

    # test numeric epoch_s format
    seconds_frame = pd.DataFrame([{"__timestamp": ts.timestamp(), "a": 1}])
    assert first_dttm(seconds_frame, "epoch_s", 0, None) == ts

    # test numeric epoch_ms format
    millis_frame = pd.DataFrame([{"__timestamp": ts.timestamp() * 1000, "a": 1}])
    assert first_dttm(millis_frame, "epoch_ms", 0, None) == ts