From d8c32b809730d16329713b486063e35557222844 Mon Sep 17 00:00:00 2001 From: Ville Brofeldt <33317356+villebro@users.noreply.github.com> Date: Tue, 16 Feb 2021 09:51:22 +0200 Subject: [PATCH] fix(chart-data-api): support numeric temporal columns (#13138) --- superset/common/query_context.py | 18 ++++++----------- superset/utils/core.py | 32 +++++++++++++++++++++++++++++++ superset/viz.py | 33 ++++++-------------------------- tests/utils_tests.py | 29 ++++++++++++++++++++++++++++ 4 files changed, 73 insertions(+), 39 deletions(-) diff --git a/superset/common/query_context.py b/superset/common/query_context.py index 01bc17056b..be1e7b7ca0 100644 --- a/superset/common/query_context.py +++ b/superset/common/query_context.py @@ -17,7 +17,6 @@ import copy import logging import math -from datetime import timedelta from typing import Any, cast, ClassVar, Dict, List, Optional, Union import numpy as np @@ -112,17 +111,12 @@ class QueryContext: # If the datetime format is unix, the parse will use the corresponding # parsing logic if not df.empty: - if DTTM_ALIAS in df.columns: - if timestamp_format in ("epoch_s", "epoch_ms"): - # Column has already been formatted as a timestamp. - df[DTTM_ALIAS] = df[DTTM_ALIAS].apply(pd.Timestamp) - else: - df[DTTM_ALIAS] = pd.to_datetime( - df[DTTM_ALIAS], utc=False, format=timestamp_format - ) - if self.datasource.offset: - df[DTTM_ALIAS] += timedelta(hours=self.datasource.offset) - df[DTTM_ALIAS] += query_object.time_shift + df = utils.normalize_dttm_col( + df=df, + timestamp_format=timestamp_format, + offset=self.datasource.offset, + time_shift=query_object.time_shift, + ) if self.enforce_numerical_metrics: self.df_metrics_to_num(df, query_object) diff --git a/superset/utils/core.py b/superset/utils/core.py index cf90963cd1..4ff3146cdc 100644 --- a/superset/utils/core.py +++ b/superset/utils/core.py @@ -76,6 +76,7 @@ from flask_appbuilder.security.sqla.models import Role, User from flask_babel import gettext as __ from flask_babel.speaklater import LazyString from pandas.api.types import infer_dtype +from pandas.core.dtypes.common import is_numeric_dtype from sqlalchemy import event, exc, select, Text from sqlalchemy.dialects.mysql import MEDIUMTEXT from sqlalchemy.engine import Connection, Engine @@ -1579,3 +1580,34 @@ def format_list(items: Sequence[str], sep: str = ", ", quote: str = '"') -> str: def find_duplicates(items: Iterable[InputType]) -> List[InputType]: """Find duplicate items in an iterable.""" return [item for item, count in collections.Counter(items).items() if count > 1] + + +def normalize_dttm_col( + df: pd.DataFrame, + timestamp_format: Optional[str], + offset: int, + time_shift: Optional[timedelta], +) -> pd.DataFrame: + if DTTM_ALIAS not in df.columns: + return df + df = df.copy() + if timestamp_format in ("epoch_s", "epoch_ms"): + dttm_col = df[DTTM_ALIAS] + if is_numeric_dtype(dttm_col): + # Column is formatted as a numeric value + unit = timestamp_format.replace("epoch_", "") + df[DTTM_ALIAS] = pd.to_datetime( + dttm_col, utc=False, unit=unit, origin="unix" + ) + else: + # Column has already been formatted as a timestamp. + df[DTTM_ALIAS] = dttm_col.apply(pd.Timestamp) + else: + df[DTTM_ALIAS] = pd.to_datetime( + df[DTTM_ALIAS], utc=False, format=timestamp_format + ) + if offset: + df[DTTM_ALIAS] += timedelta(hours=offset) + if time_shift is not None: + df[DTTM_ALIAS] += time_shift + return df diff --git a/superset/viz.py b/superset/viz.py index 33ab550511..db88383115 100644 --- a/superset/viz.py +++ b/superset/viz.py @@ -284,33 +284,12 @@ class BaseViz: # If the datetime format is unix, the parse will use the corresponding # parsing logic. if not df.empty: - if DTTM_ALIAS in df.columns: - if timestamp_format in ("epoch_s", "epoch_ms"): - # Column has already been formatted as a timestamp. - dttm_col = df[DTTM_ALIAS] - one_ts_val = dttm_col[0] - - # convert time column to pandas Timestamp, but different - # ways to convert depending on string or int types - try: - int(one_ts_val) - is_integral = True - except (ValueError, TypeError): - is_integral = False - if is_integral: - unit = "s" if timestamp_format == "epoch_s" else "ms" - df[DTTM_ALIAS] = pd.to_datetime( - dttm_col, utc=False, unit=unit, origin="unix" - ) - else: - df[DTTM_ALIAS] = dttm_col.apply(pd.Timestamp) - else: - df[DTTM_ALIAS] = pd.to_datetime( - df[DTTM_ALIAS], utc=False, format=timestamp_format - ) - if self.datasource.offset: - df[DTTM_ALIAS] += timedelta(hours=self.datasource.offset) - df[DTTM_ALIAS] += self.time_shift + df = utils.normalize_dttm_col( + df=df, + timestamp_format=timestamp_format, + offset=self.datasource.offset, + time_shift=self.time_shift, + ) if self.enforce_numerical_metrics: self.df_metrics_to_num(df) diff --git a/tests/utils_tests.py b/tests/utils_tests.py index 571590d579..9fed17251b 100644 --- a/tests/utils_tests.py +++ b/tests/utils_tests.py @@ -45,6 +45,7 @@ from superset.utils.core import ( cast_to_num, convert_legacy_filters_into_adhoc, create_ssl_cert_file, + DTTM_ALIAS, format_timedelta, GenericDataType, get_form_data_token, @@ -59,6 +60,7 @@ from superset.utils.core import ( merge_extra_filters, merge_extra_form_data, merge_request_params, + normalize_dttm_col, parse_ssl_cert, parse_js_uri_path_item, extract_dataframe_dtypes, @@ -1131,3 +1133,30 @@ class TestUtils(SupersetTestCase): df = pd.DataFrame(data={col[0]: col[2] for col in cols}) assert extract_dataframe_dtypes(df) == [col[1] for col in cols] + + def test_normalize_dttm_col(self): + ts = pd.Timestamp(2021, 2, 15, 19, 0, 0, 0) + df = pd.DataFrame([{"__timestamp": ts, "a": 1}]) + + # test regular (non-numeric) format + assert normalize_dttm_col(df, None, 0, None)[DTTM_ALIAS][0] == ts + assert normalize_dttm_col(df, "epoch_ms", 0, None)[DTTM_ALIAS][0] == ts + assert normalize_dttm_col(df, "epoch_s", 0, None)[DTTM_ALIAS][0] == ts + + # test offset + assert normalize_dttm_col(df, None, 1, None)[DTTM_ALIAS][0] == pd.Timestamp( + 2021, 2, 15, 20, 0, 0, 0 + ) + + # test offset and timedelta + assert normalize_dttm_col(df, None, 1, timedelta(minutes=30))[DTTM_ALIAS][ + 0 + ] == pd.Timestamp(2021, 2, 15, 20, 30, 0, 0) + + # test numeric epoch_s format + df = pd.DataFrame([{"__timestamp": ts.timestamp(), "a": 1}]) + assert normalize_dttm_col(df, "epoch_s", 0, None)[DTTM_ALIAS][0] == ts + + # test numeric epoch_ms format + df = pd.DataFrame([{"__timestamp": ts.timestamp() * 1000, "a": 1}]) + assert normalize_dttm_col(df, "epoch_ms", 0, None)[DTTM_ALIAS][0] == ts