mirror of https://github.com/apache/superset.git
Replace pandas.DataFrame with PyArrow.Table for nullable int typing (#8733)
* Use PyArrow Table for query result serialization * Cleanup dev comments * Additional cleanup * WIP: tests * Remove explicit dtype logic from db_engine_specs * Remove obsolete column property * SupersetTable column types * Port SupersetDataFrame methods to SupersetTable * Add test for nullable boolean columns * Support datetime values with timezone offsets * Black formatting * Pylint * More linting/formatting * Resolve issue with timezones not appearing in results * Types * Enable running of tests in tests/db_engine_specs * Resolve application context errors * Refactor and add tests for pyodbc.Row conversion * Appease isort, regardless of isort:skip * Re-enable RESULTS_BACKEND_USE_MSGPACK default based on benchmarks * Dataframe typing and nits * Renames to reduce ambiguity
This commit is contained in:
parent
4f8bf2b04d
commit
6537d5ed8c
|
@ -55,13 +55,11 @@ describe('ExploreResultsButton', () => {
|
|||
const mockColumns = {
|
||||
ds: {
|
||||
is_date: true,
|
||||
is_dim: false,
|
||||
name: 'ds',
|
||||
type: 'STRING',
|
||||
},
|
||||
gender: {
|
||||
is_date: false,
|
||||
is_dim: true,
|
||||
name: 'gender',
|
||||
type: 'STRING',
|
||||
},
|
||||
|
|
|
@ -219,13 +219,11 @@ export const queries = [
|
|||
columns: [
|
||||
{
|
||||
is_date: true,
|
||||
is_dim: false,
|
||||
name: 'ds',
|
||||
type: 'STRING',
|
||||
},
|
||||
{
|
||||
is_date: false,
|
||||
is_dim: true,
|
||||
name: 'gender',
|
||||
type: 'STRING',
|
||||
},
|
||||
|
@ -233,13 +231,11 @@ export const queries = [
|
|||
selected_columns: [
|
||||
{
|
||||
is_date: true,
|
||||
is_dim: false,
|
||||
name: 'ds',
|
||||
type: 'STRING',
|
||||
},
|
||||
{
|
||||
is_date: false,
|
||||
is_dim: true,
|
||||
name: 'gender',
|
||||
type: 'STRING',
|
||||
},
|
||||
|
@ -291,37 +287,31 @@ export const queryWithBadColumns = {
|
|||
selected_columns: [
|
||||
{
|
||||
is_date: true,
|
||||
is_dim: false,
|
||||
name: 'COUNT(*)',
|
||||
type: 'STRING',
|
||||
},
|
||||
{
|
||||
is_date: false,
|
||||
is_dim: true,
|
||||
name: 'this_col_is_ok',
|
||||
type: 'STRING',
|
||||
},
|
||||
{
|
||||
is_date: false,
|
||||
is_dim: true,
|
||||
name: 'a',
|
||||
type: 'STRING',
|
||||
},
|
||||
{
|
||||
is_date: false,
|
||||
is_dim: true,
|
||||
name: '1',
|
||||
type: 'STRING',
|
||||
},
|
||||
{
|
||||
is_date: false,
|
||||
is_dim: true,
|
||||
name: '123',
|
||||
type: 'STRING',
|
||||
},
|
||||
{
|
||||
is_date: false,
|
||||
is_dim: true,
|
||||
name: 'CASE WHEN 1=1 THEN 1 ELSE 0 END',
|
||||
type: 'STRING',
|
||||
},
|
||||
|
|
|
@ -519,7 +519,7 @@ RESULTS_BACKEND = None
|
|||
# rather than JSON. This feature requires additional testing from the
|
||||
# community before it is fully adopted, so this config option is provided
|
||||
# in order to disable should breaking issues be discovered.
|
||||
RESULTS_BACKEND_USE_MSGPACK = False
|
||||
RESULTS_BACKEND_USE_MSGPACK = True
|
||||
|
||||
# The S3 bucket where you want to store your external hive tables created
|
||||
# from CSV files. For example, 'companyname-superset'
|
||||
|
|
|
@ -14,257 +14,22 @@
|
|||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
# pylint: disable=C,R,W
|
||||
""" Superset wrapper around pandas.DataFrame.
|
||||
|
||||
TODO(bkyryliuk): add support for the conventions like: *_dim or dim_*
|
||||
dimensions, *_ts, ts_*, ds_*, *_ds - datetime, etc.
|
||||
TODO(bkyryliuk): recognize integer encoded enums.
|
||||
|
||||
""" Superset utilities for pandas.DataFrame.
|
||||
"""
|
||||
import logging
|
||||
from datetime import date, datetime
|
||||
from typing import Any, Dict, List
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from pandas.core.common import maybe_box_datetimelike
|
||||
from pandas.core.dtypes.dtypes import ExtensionDtype
|
||||
|
||||
from superset.utils.core import JS_MAX_INTEGER
|
||||
|
||||
INFER_COL_TYPES_THRESHOLD = 95
|
||||
INFER_COL_TYPES_SAMPLE_SIZE = 100
|
||||
|
||||
|
||||
def dedup(l, suffix="__", case_sensitive=True):
|
||||
"""De-duplicates a list of string by suffixing a counter
|
||||
|
||||
Always returns the same number of entries as provided, and always returns
|
||||
unique values. Case sensitive comparison by default.
|
||||
|
||||
>>> print(','.join(dedup(['foo', 'bar', 'bar', 'bar', 'Bar'])))
|
||||
foo,bar,bar__1,bar__2,Bar
|
||||
>>> print(
|
||||
','.join(dedup(['foo', 'bar', 'bar', 'bar', 'Bar'], case_sensitive=False))
|
||||
)
|
||||
foo,bar,bar__1,bar__2,Bar__3
|
||||
"""
|
||||
new_l = []
|
||||
seen = {}
|
||||
for s in l:
|
||||
s_fixed_case = s if case_sensitive else s.lower()
|
||||
if s_fixed_case in seen:
|
||||
seen[s_fixed_case] += 1
|
||||
s += suffix + str(seen[s_fixed_case])
|
||||
else:
|
||||
seen[s_fixed_case] = 0
|
||||
new_l.append(s)
|
||||
return new_l
|
||||
|
||||
|
||||
def is_numeric(dtype):
|
||||
if hasattr(dtype, "_is_numeric"):
|
||||
return dtype._is_numeric
|
||||
return np.issubdtype(dtype, np.number)
|
||||
|
||||
|
||||
class SupersetDataFrame:
|
||||
# Mapping numpy dtype.char to generic database types
|
||||
type_map = {
|
||||
"b": "BOOL", # boolean
|
||||
"i": "INT", # (signed) integer
|
||||
"u": "INT", # unsigned integer
|
||||
"l": "INT", # 64bit integer
|
||||
"f": "FLOAT", # floating-point
|
||||
"c": "FLOAT", # complex-floating point
|
||||
"m": None, # timedelta
|
||||
"M": "DATETIME", # datetime
|
||||
"O": "OBJECT", # (Python) objects
|
||||
"S": "BYTE", # (byte-)string
|
||||
"U": "STRING", # Unicode
|
||||
"V": None, # raw data (void)
|
||||
}
|
||||
|
||||
def __init__(self, data, cursor_description, db_engine_spec):
|
||||
data = data or []
|
||||
|
||||
column_names = []
|
||||
dtype = None
|
||||
if cursor_description:
|
||||
# get deduped list of column names
|
||||
column_names = dedup([col[0] for col in cursor_description])
|
||||
|
||||
# fix cursor descriptor with the deduped names
|
||||
cursor_description = [
|
||||
tuple([column_name, *list(description)[1:]])
|
||||
for column_name, description in zip(column_names, cursor_description)
|
||||
]
|
||||
|
||||
# get type for better type casting, if possible
|
||||
dtype = db_engine_spec.get_pandas_dtype(cursor_description)
|
||||
|
||||
self.column_names = column_names
|
||||
|
||||
if dtype:
|
||||
# put data in a 2D array so we can efficiently access each column;
|
||||
# the reshape ensures the shape is 2D in case data is empty
|
||||
array = np.array(data, dtype="object").reshape(-1, len(column_names))
|
||||
# convert each column in data into a Series of the proper dtype; we
|
||||
# need to do this because we can not specify a mixed dtype when
|
||||
# instantiating the DataFrame, and this allows us to have different
|
||||
# dtypes for each column.
|
||||
data = {
|
||||
column: pd.Series(array[:, i], dtype=dtype[column])
|
||||
for i, column in enumerate(column_names)
|
||||
}
|
||||
self.df = pd.DataFrame(data, columns=column_names)
|
||||
else:
|
||||
self.df = pd.DataFrame(list(data), columns=column_names).infer_objects()
|
||||
|
||||
self._type_dict = {}
|
||||
try:
|
||||
# The driver may not be passing a cursor.description
|
||||
self._type_dict = {
|
||||
col: db_engine_spec.get_datatype(cursor_description[i][1])
|
||||
for i, col in enumerate(column_names)
|
||||
if cursor_description
|
||||
}
|
||||
except Exception as e:
|
||||
logging.exception(e)
|
||||
|
||||
@property
|
||||
def raw_df(self):
|
||||
return self.df
|
||||
|
||||
@property
|
||||
def size(self):
|
||||
return len(self.df.index)
|
||||
|
||||
@property
|
||||
def data(self):
|
||||
return self.format_data(self.df)
|
||||
|
||||
@classmethod
|
||||
def format_data(cls, df):
|
||||
# work around for https://github.com/pandas-dev/pandas/issues/18372
|
||||
data = [
|
||||
dict(
|
||||
(k, maybe_box_datetimelike(v))
|
||||
for k, v in zip(df.columns, np.atleast_1d(row))
|
||||
)
|
||||
for row in df.values
|
||||
]
|
||||
def df_to_records(dframe: pd.DataFrame) -> List[Dict[str, Any]]:
|
||||
data: List[Dict[str, Any]] = dframe.to_dict(orient="records")
|
||||
# TODO: refactor this
|
||||
for d in data:
|
||||
for k, v in list(d.items()):
|
||||
# if an int is too big for Java Script to handle
|
||||
# if an int is too big for JavaScript to handle
|
||||
# convert it to a string
|
||||
if isinstance(v, int):
|
||||
if abs(v) > JS_MAX_INTEGER:
|
||||
if isinstance(v, int) and abs(v) > JS_MAX_INTEGER:
|
||||
d[k] = str(v)
|
||||
return data
|
||||
|
||||
@classmethod
|
||||
def db_type(cls, dtype):
|
||||
"""Given a numpy dtype, Returns a generic database type"""
|
||||
if isinstance(dtype, ExtensionDtype):
|
||||
return cls.type_map.get(dtype.kind)
|
||||
elif hasattr(dtype, "char"):
|
||||
return cls.type_map.get(dtype.char)
|
||||
|
||||
@classmethod
|
||||
def datetime_conversion_rate(cls, data_series):
|
||||
success = 0
|
||||
total = 0
|
||||
for value in data_series:
|
||||
total += 1
|
||||
try:
|
||||
pd.to_datetime(value)
|
||||
success += 1
|
||||
except Exception:
|
||||
continue
|
||||
return 100 * success / total
|
||||
|
||||
@staticmethod
|
||||
def is_date(np_dtype, db_type_str):
|
||||
def looks_daty(s):
|
||||
if isinstance(s, str):
|
||||
return any([s.lower().startswith(ss) for ss in ("time", "date")])
|
||||
return False
|
||||
|
||||
if looks_daty(db_type_str):
|
||||
return True
|
||||
if np_dtype and np_dtype.name and looks_daty(np_dtype.name):
|
||||
return True
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def is_dimension(cls, dtype, column_name):
|
||||
if cls.is_id(column_name):
|
||||
return False
|
||||
return dtype.name in ("object", "bool")
|
||||
|
||||
@classmethod
|
||||
def is_id(cls, column_name):
|
||||
return column_name.startswith("id") or column_name.endswith("id")
|
||||
|
||||
@classmethod
|
||||
def agg_func(cls, dtype, column_name):
|
||||
# consider checking for key substring too.
|
||||
if cls.is_id(column_name):
|
||||
return "count_distinct"
|
||||
if (
|
||||
hasattr(dtype, "type")
|
||||
and issubclass(dtype.type, np.generic)
|
||||
and is_numeric(dtype)
|
||||
):
|
||||
return "sum"
|
||||
return None
|
||||
|
||||
@property
|
||||
def columns(self):
|
||||
"""Provides metadata about columns for data visualization.
|
||||
|
||||
:return: dict, with the fields name, type, is_date, is_dim and agg.
|
||||
"""
|
||||
if self.df.empty:
|
||||
return None
|
||||
|
||||
columns = []
|
||||
sample_size = min(INFER_COL_TYPES_SAMPLE_SIZE, len(self.df.index))
|
||||
sample = self.df
|
||||
if sample_size:
|
||||
sample = self.df.sample(sample_size)
|
||||
for col in self.df.dtypes.keys():
|
||||
db_type_str = self._type_dict.get(col) or self.db_type(self.df.dtypes[col])
|
||||
column = {
|
||||
"name": col,
|
||||
"agg": self.agg_func(self.df.dtypes[col], col),
|
||||
"type": db_type_str,
|
||||
"is_date": self.is_date(self.df.dtypes[col], db_type_str),
|
||||
"is_dim": self.is_dimension(self.df.dtypes[col], col),
|
||||
}
|
||||
|
||||
if not db_type_str or db_type_str.upper() == "OBJECT":
|
||||
v = sample[col].iloc[0] if not sample[col].empty else None
|
||||
if isinstance(v, str):
|
||||
column["type"] = "STRING"
|
||||
elif isinstance(v, int):
|
||||
column["type"] = "INT"
|
||||
elif isinstance(v, float):
|
||||
column["type"] = "FLOAT"
|
||||
elif isinstance(v, (datetime, date)):
|
||||
column["type"] = "DATETIME"
|
||||
column["is_date"] = True
|
||||
column["is_dim"] = False
|
||||
# check if encoded datetime
|
||||
if (
|
||||
column["type"] == "STRING"
|
||||
and self.datetime_conversion_rate(sample[col])
|
||||
> INFER_COL_TYPES_THRESHOLD
|
||||
):
|
||||
column.update({"is_date": True, "is_dim": False, "agg": None})
|
||||
# 'agg' is optional attribute
|
||||
if not column["agg"]:
|
||||
column.pop("agg", None)
|
||||
columns.append(column)
|
||||
return columns
|
||||
|
|
|
@ -289,12 +289,6 @@ class BaseEngineSpec: # pylint: disable=too-many-public-methods
|
|||
return type_code.upper()
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def get_pandas_dtype(
|
||||
cls, cursor_description: List[tuple]
|
||||
) -> Optional[Dict[str, str]]:
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def extra_table_metadata(
|
||||
cls, database, table_name: str, schema_name: str
|
||||
|
|
|
@ -24,20 +24,6 @@ from sqlalchemy import literal_column
|
|||
|
||||
from superset.db_engine_specs.base import BaseEngineSpec
|
||||
|
||||
pandas_dtype_map = {
|
||||
"STRING": "object",
|
||||
"BOOLEAN": "object", # to support nullable bool
|
||||
"INTEGER": "Int64",
|
||||
"FLOAT": "float64",
|
||||
"TIMESTAMP": "datetime64[ns]",
|
||||
"DATETIME": "datetime64[ns]",
|
||||
"DATE": "object",
|
||||
"BYTES": "object",
|
||||
"TIME": "object",
|
||||
"RECORD": "object",
|
||||
"NUMERIC": "object",
|
||||
}
|
||||
|
||||
|
||||
class BigQueryEngineSpec(BaseEngineSpec):
|
||||
"""Engine spec for Google's BigQuery
|
||||
|
@ -209,9 +195,3 @@ class BigQueryEngineSpec(BaseEngineSpec):
|
|||
if key in kwargs:
|
||||
gbq_kwargs[key] = kwargs[key]
|
||||
pandas_gbq.to_gbq(df, **gbq_kwargs)
|
||||
|
||||
@classmethod
|
||||
def get_pandas_dtype(cls, cursor_description: List[tuple]) -> Dict[str, str]:
|
||||
return {
|
||||
col[0]: pandas_dtype_map.get(col[1], "object") for col in cursor_description
|
||||
}
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
from datetime import datetime
|
||||
from typing import List, Optional, Tuple, TYPE_CHECKING
|
||||
|
||||
from pytz import _FixedOffset # type: ignore
|
||||
from sqlalchemy.dialects.postgresql.base import PGInspector
|
||||
|
||||
from superset.db_engine_specs.base import BaseEngineSpec, LimitMethod
|
||||
|
@ -26,6 +27,12 @@ if TYPE_CHECKING:
|
|||
from superset.models.core import Database # pylint: disable=unused-import
|
||||
|
||||
|
||||
# Replace psycopg2.tz.FixedOffsetTimezone with pytz, which is serializable by PyArrow
|
||||
# https://github.com/stub42/pytz/blob/b70911542755aeeea7b5a9e066df5e1c87e8f2c8/src/pytz/reference.py#L25
|
||||
class FixedOffsetTimezone(_FixedOffset):
|
||||
pass
|
||||
|
||||
|
||||
class PostgresBaseEngineSpec(BaseEngineSpec):
|
||||
""" Abstract class for Postgres 'like' databases """
|
||||
|
||||
|
@ -45,6 +52,7 @@ class PostgresBaseEngineSpec(BaseEngineSpec):
|
|||
|
||||
@classmethod
|
||||
def fetch_data(cls, cursor, limit: int) -> List[Tuple]:
|
||||
cursor.tzinfo_factory = FixedOffsetTimezone
|
||||
if not cursor.description:
|
||||
return []
|
||||
if cls.limit_method == LimitMethod.FETCH_MANY:
|
||||
|
|
|
@ -46,21 +46,6 @@ if TYPE_CHECKING:
|
|||
QueryStatus = utils.QueryStatus
|
||||
config = app.config
|
||||
|
||||
# map between Presto types and Pandas
|
||||
pandas_dtype_map = {
|
||||
"boolean": "object", # to support nullable bool
|
||||
"tinyint": "Int64", # note: capital "I" means nullable int
|
||||
"smallint": "Int64",
|
||||
"integer": "Int64",
|
||||
"bigint": "Int64",
|
||||
"real": "float64",
|
||||
"double": "float64",
|
||||
"varchar": "object",
|
||||
"timestamp": "datetime64[ns]",
|
||||
"date": "datetime64[ns]",
|
||||
"varbinary": "object",
|
||||
}
|
||||
|
||||
|
||||
def get_children(column: Dict[str, str]) -> List[Dict[str, str]]:
|
||||
"""
|
||||
|
@ -962,9 +947,3 @@ class PrestoEngineSpec(BaseEngineSpec):
|
|||
if df.empty:
|
||||
return ""
|
||||
return df.to_dict()[field_to_return][0]
|
||||
|
||||
@classmethod
|
||||
def get_pandas_dtype(cls, cursor_description: List[tuple]) -> Dict[str, str]:
|
||||
return {
|
||||
col[0]: pandas_dtype_map.get(col[1], "object") for col in cursor_description
|
||||
}
|
||||
|
|
|
@ -0,0 +1,177 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
# pylint: disable=C,R,W
|
||||
""" Superset wrapper around pyarrow.Table.
|
||||
"""
|
||||
import datetime
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Callable, Dict, List, Optional, Tuple, Type
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pyarrow as pa
|
||||
|
||||
from superset import db_engine_specs
|
||||
|
||||
|
||||
def dedup(l: List[str], suffix: str = "__", case_sensitive: bool = True) -> List[str]:
|
||||
"""De-duplicates a list of string by suffixing a counter
|
||||
|
||||
Always returns the same number of entries as provided, and always returns
|
||||
unique values. Case sensitive comparison by default.
|
||||
|
||||
>>> print(','.join(dedup(['foo', 'bar', 'bar', 'bar', 'Bar'])))
|
||||
foo,bar,bar__1,bar__2,Bar
|
||||
>>> print(
|
||||
','.join(dedup(['foo', 'bar', 'bar', 'bar', 'Bar'], case_sensitive=False))
|
||||
)
|
||||
foo,bar,bar__1,bar__2,Bar__3
|
||||
"""
|
||||
new_l: List[str] = []
|
||||
seen: Dict[str, int] = {}
|
||||
for s in l:
|
||||
s_fixed_case = s if case_sensitive else s.lower()
|
||||
if s_fixed_case in seen:
|
||||
seen[s_fixed_case] += 1
|
||||
s += suffix + str(seen[s_fixed_case])
|
||||
else:
|
||||
seen[s_fixed_case] = 0
|
||||
new_l.append(s)
|
||||
return new_l
|
||||
|
||||
|
||||
class SupersetResultSet:
|
||||
def __init__(
|
||||
self,
|
||||
data: List[Tuple[Any, ...]],
|
||||
cursor_description: Tuple[Any, ...],
|
||||
db_engine_spec: Type[db_engine_specs.BaseEngineSpec],
|
||||
):
|
||||
data = data or []
|
||||
column_names: List[str] = []
|
||||
pa_data: List[pa.Array] = []
|
||||
deduped_cursor_desc: List[Tuple[Any, ...]] = []
|
||||
|
||||
if cursor_description:
|
||||
# get deduped list of column names
|
||||
column_names = dedup([col[0] for col in cursor_description])
|
||||
|
||||
# fix cursor descriptor with the deduped names
|
||||
deduped_cursor_desc = [
|
||||
tuple([column_name, *list(description)[1:]])
|
||||
for column_name, description in zip(column_names, cursor_description)
|
||||
]
|
||||
|
||||
# put data in a 2D array so we can efficiently access each column;
|
||||
array = np.array(data, dtype="object")
|
||||
if array.size > 0:
|
||||
pa_data = [pa.array(array[:, i]) for i, column in enumerate(column_names)]
|
||||
|
||||
# workaround for bug converting `psycopg2.tz.FixedOffsetTimezone` tzinfo values.
|
||||
# related: https://issues.apache.org/jira/browse/ARROW-5248
|
||||
if pa_data:
|
||||
for i, column in enumerate(column_names):
|
||||
if pa.types.is_temporal(pa_data[i].type):
|
||||
sample = self.first_nonempty(array[:, i])
|
||||
if sample and isinstance(sample, datetime.datetime):
|
||||
try:
|
||||
if sample.tzinfo:
|
||||
series = pd.Series(array[:, i], dtype="datetime64[ns]")
|
||||
pa_data[i] = pa.Array.from_pandas(
|
||||
series, type=pa.timestamp("ns", tz=sample.tzinfo)
|
||||
)
|
||||
except Exception as e:
|
||||
logging.exception(e)
|
||||
|
||||
self.table = pa.Table.from_arrays(pa_data, names=column_names)
|
||||
self._type_dict: Dict[str, Any] = {}
|
||||
try:
|
||||
# The driver may not be passing a cursor.description
|
||||
self._type_dict = {
|
||||
col: db_engine_spec.get_datatype(deduped_cursor_desc[i][1])
|
||||
for i, col in enumerate(column_names)
|
||||
if deduped_cursor_desc
|
||||
}
|
||||
except Exception as e:
|
||||
logging.exception(e)
|
||||
|
||||
@staticmethod
|
||||
def convert_pa_dtype(pa_dtype: pa.DataType) -> Optional[str]:
|
||||
if pa.types.is_boolean(pa_dtype):
|
||||
return "BOOL"
|
||||
if pa.types.is_integer(pa_dtype):
|
||||
return "INT"
|
||||
if pa.types.is_floating(pa_dtype):
|
||||
return "FLOAT"
|
||||
if pa.types.is_string(pa_dtype):
|
||||
return "STRING"
|
||||
if pa.types.is_temporal(pa_dtype):
|
||||
return "DATETIME"
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def convert_table_to_df(table: pa.Table) -> pd.DataFrame:
|
||||
return table.to_pandas(integer_object_nulls=True)
|
||||
|
||||
@staticmethod
|
||||
def first_nonempty(items: List) -> Any:
|
||||
return next((i for i in items if i), None)
|
||||
|
||||
@staticmethod
|
||||
def is_date(db_type_str: Optional[str]) -> bool:
|
||||
return db_type_str in ("DATETIME", "TIMESTAMP")
|
||||
|
||||
def data_type(self, col_name: str, pa_dtype: pa.DataType) -> Optional[str]:
|
||||
"""Given a pyarrow data type, Returns a generic database type"""
|
||||
set_type = self._type_dict.get(col_name)
|
||||
if set_type:
|
||||
return set_type
|
||||
|
||||
mapped_type = self.convert_pa_dtype(pa_dtype)
|
||||
if mapped_type:
|
||||
return mapped_type
|
||||
|
||||
return None
|
||||
|
||||
def to_pandas_df(self) -> pd.DataFrame:
|
||||
return self.convert_table_to_df(self.table)
|
||||
|
||||
@property
|
||||
def pa_table(self) -> pa.Table:
|
||||
return self.table
|
||||
|
||||
@property
|
||||
def size(self) -> int:
|
||||
return self.table.num_rows
|
||||
|
||||
@property
|
||||
def columns(self) -> List[Dict[str, Any]]:
|
||||
if not self.table.column_names:
|
||||
return []
|
||||
|
||||
columns = []
|
||||
for col in self.table.schema:
|
||||
db_type_str = self.data_type(col.name, col.type)
|
||||
column = {
|
||||
"name": col.name,
|
||||
"type": db_type_str,
|
||||
"is_date": self.is_date(db_type_str),
|
||||
}
|
||||
columns.append(column)
|
||||
|
||||
return columns
|
|
@ -19,7 +19,7 @@ import uuid
|
|||
from contextlib import closing
|
||||
from datetime import datetime
|
||||
from sys import getsizeof
|
||||
from typing import Optional, Tuple, Union
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
|
||||
import backoff
|
||||
import msgpack
|
||||
|
@ -39,10 +39,11 @@ from superset import (
|
|||
results_backend_use_msgpack,
|
||||
security_manager,
|
||||
)
|
||||
from superset.dataframe import SupersetDataFrame
|
||||
from superset.dataframe import df_to_records
|
||||
from superset.db_engine_specs import BaseEngineSpec
|
||||
from superset.extensions import celery_app
|
||||
from superset.models.sql_lab import Query
|
||||
from superset.result_set import SupersetResultSet
|
||||
from superset.sql_parse import ParsedQuery
|
||||
from superset.utils.core import json_iso_dttm_ser, QueryStatus, sources, zlib_compress
|
||||
from superset.utils.dates import now_as_float
|
||||
|
@ -251,7 +252,7 @@ def execute_sql_statement(sql_statement, query, user_name, session, cursor, log_
|
|||
|
||||
logger.debug(f"Query {query.id}: Fetching cursor description")
|
||||
cursor_description = cursor.description
|
||||
return SupersetDataFrame(data, cursor_description, db_engine_spec)
|
||||
return SupersetResultSet(data, cursor_description, db_engine_spec)
|
||||
|
||||
|
||||
def _serialize_payload(
|
||||
|
@ -265,13 +266,13 @@ def _serialize_payload(
|
|||
|
||||
|
||||
def _serialize_and_expand_data(
|
||||
cdf: SupersetDataFrame,
|
||||
result_set: SupersetResultSet,
|
||||
db_engine_spec: BaseEngineSpec,
|
||||
use_msgpack: Optional[bool] = False,
|
||||
expand_data: bool = False,
|
||||
) -> Tuple[Union[bytes, str], list, list, list]:
|
||||
selected_columns: list = cdf.columns or []
|
||||
expanded_columns: list
|
||||
selected_columns: List[Dict] = result_set.columns
|
||||
expanded_columns: List[Dict]
|
||||
|
||||
if use_msgpack:
|
||||
with stats_timing(
|
||||
|
@ -279,14 +280,17 @@ def _serialize_and_expand_data(
|
|||
):
|
||||
data = (
|
||||
pa.default_serialization_context()
|
||||
.serialize(cdf.raw_df)
|
||||
.serialize(result_set.pa_table)
|
||||
.to_buffer()
|
||||
.to_pybytes()
|
||||
)
|
||||
|
||||
# expand when loading data from results backend
|
||||
all_columns, expanded_columns = (selected_columns, [])
|
||||
else:
|
||||
data = cdf.data or []
|
||||
df = result_set.to_pandas_df()
|
||||
data = df_to_records(df) or []
|
||||
|
||||
if expand_data:
|
||||
all_columns, data, expanded_columns = db_engine_spec.expand_data(
|
||||
selected_columns, data
|
||||
|
@ -356,7 +360,7 @@ def execute_sql_statements(
|
|||
query.set_extra_json_key("progress", msg)
|
||||
session.commit()
|
||||
try:
|
||||
cdf = execute_sql_statement(
|
||||
result_set = execute_sql_statement(
|
||||
statement, query, user_name, session, cursor, log_params
|
||||
)
|
||||
except Exception as e: # pylint: disable=broad-except
|
||||
|
@ -367,7 +371,7 @@ def execute_sql_statements(
|
|||
return payload
|
||||
|
||||
# Success, updating the query entry in database
|
||||
query.rows = cdf.size
|
||||
query.rows = result_set.size
|
||||
query.progress = 100
|
||||
query.set_extra_json_key("progress", None)
|
||||
if query.select_as_cta:
|
||||
|
@ -381,9 +385,13 @@ def execute_sql_statements(
|
|||
query.end_time = now_as_float()
|
||||
|
||||
data, selected_columns, all_columns, expanded_columns = _serialize_and_expand_data(
|
||||
cdf, db_engine_spec, store_results and results_backend_use_msgpack, expand_data
|
||||
result_set,
|
||||
db_engine_spec,
|
||||
store_results and results_backend_use_msgpack,
|
||||
expand_data,
|
||||
)
|
||||
|
||||
# TODO: data should be saved separately from metadata (likely in Parquet)
|
||||
payload.update(
|
||||
{
|
||||
"status": QueryStatus.SUCCESS,
|
||||
|
|
|
@ -61,6 +61,7 @@ from superset import (
|
|||
event_logger,
|
||||
get_feature_flags,
|
||||
is_feature_enabled,
|
||||
result_set,
|
||||
results_backend,
|
||||
results_backend_use_msgpack,
|
||||
security_manager,
|
||||
|
@ -227,10 +228,10 @@ def _deserialize_results_payload(
|
|||
ds_payload = msgpack.loads(payload, raw=False)
|
||||
|
||||
with stats_timing("sqllab.query.results_backend_pa_deserialize", stats_logger):
|
||||
df = pa.deserialize(ds_payload["data"])
|
||||
pa_table = pa.deserialize(ds_payload["data"])
|
||||
|
||||
# TODO: optimize this, perhaps via df.to_dict, then traversing
|
||||
ds_payload["data"] = dataframe.SupersetDataFrame.format_data(df) or []
|
||||
df = result_set.SupersetResultSet.convert_table_to_df(pa_table)
|
||||
ds_payload["data"] = dataframe.df_to_records(df) or []
|
||||
|
||||
db_engine_spec = query.database.db_engine_spec
|
||||
all_columns, data, expanded_columns = db_engine_spec.expand_data(
|
||||
|
|
|
@ -28,7 +28,7 @@ from flask import current_app
|
|||
|
||||
from tests.test_app import app
|
||||
from superset import db, sql_lab
|
||||
from superset.dataframe import SupersetDataFrame
|
||||
from superset.result_set import SupersetResultSet
|
||||
from superset.db_engine_specs.base import BaseEngineSpec
|
||||
from superset.extensions import celery_app
|
||||
from superset.models.helpers import QueryStatus
|
||||
|
@ -275,13 +275,13 @@ class CeleryTestCase(SupersetTestCase):
|
|||
("d", "datetime"),
|
||||
)
|
||||
db_engine_spec = BaseEngineSpec()
|
||||
cdf = SupersetDataFrame(data, cursor_descr, db_engine_spec)
|
||||
results = SupersetResultSet(data, cursor_descr, db_engine_spec)
|
||||
|
||||
with mock.patch.object(
|
||||
db_engine_spec, "expand_data", wraps=db_engine_spec.expand_data
|
||||
) as expand_data:
|
||||
data, selected_columns, all_columns, expanded_columns = sql_lab._serialize_and_expand_data(
|
||||
cdf, db_engine_spec, False, True
|
||||
results, db_engine_spec, False, True
|
||||
)
|
||||
expand_data.assert_called_once()
|
||||
|
||||
|
@ -296,13 +296,13 @@ class CeleryTestCase(SupersetTestCase):
|
|||
("d", "datetime"),
|
||||
)
|
||||
db_engine_spec = BaseEngineSpec()
|
||||
cdf = SupersetDataFrame(data, cursor_descr, db_engine_spec)
|
||||
results = SupersetResultSet(data, cursor_descr, db_engine_spec)
|
||||
|
||||
with mock.patch.object(
|
||||
db_engine_spec, "expand_data", wraps=db_engine_spec.expand_data
|
||||
) as expand_data:
|
||||
data, selected_columns, all_columns, expanded_columns = sql_lab._serialize_and_expand_data(
|
||||
cdf, db_engine_spec, True
|
||||
results, db_engine_spec, True
|
||||
)
|
||||
expand_data.assert_not_called()
|
||||
|
||||
|
@ -318,14 +318,14 @@ class CeleryTestCase(SupersetTestCase):
|
|||
("d", "datetime"),
|
||||
)
|
||||
db_engine_spec = BaseEngineSpec()
|
||||
cdf = SupersetDataFrame(data, cursor_descr, db_engine_spec)
|
||||
results = SupersetResultSet(data, cursor_descr, db_engine_spec)
|
||||
query = {
|
||||
"database_id": 1,
|
||||
"sql": "SELECT * FROM birth_names LIMIT 100",
|
||||
"status": QueryStatus.PENDING,
|
||||
}
|
||||
serialized_data, selected_columns, all_columns, expanded_columns = sql_lab._serialize_and_expand_data(
|
||||
cdf, db_engine_spec, use_new_deserialization
|
||||
results, db_engine_spec, use_new_deserialization
|
||||
)
|
||||
payload = {
|
||||
"query_id": 1,
|
||||
|
@ -351,14 +351,14 @@ class CeleryTestCase(SupersetTestCase):
|
|||
("d", "datetime"),
|
||||
)
|
||||
db_engine_spec = BaseEngineSpec()
|
||||
cdf = SupersetDataFrame(data, cursor_descr, db_engine_spec)
|
||||
results = SupersetResultSet(data, cursor_descr, db_engine_spec)
|
||||
query = {
|
||||
"database_id": 1,
|
||||
"sql": "SELECT * FROM birth_names LIMIT 100",
|
||||
"status": QueryStatus.PENDING,
|
||||
}
|
||||
serialized_data, selected_columns, all_columns, expanded_columns = sql_lab._serialize_and_expand_data(
|
||||
cdf, db_engine_spec, use_new_deserialization
|
||||
results, db_engine_spec, use_new_deserialization
|
||||
)
|
||||
payload = {
|
||||
"query_id": 1,
|
||||
|
|
|
@ -24,6 +24,7 @@ import io
|
|||
import json
|
||||
import logging
|
||||
import os
|
||||
import pytz
|
||||
import random
|
||||
import re
|
||||
import string
|
||||
|
@ -44,12 +45,12 @@ from superset.models.dashboard import Dashboard
|
|||
from superset.models.datasource_access_request import DatasourceAccessRequest
|
||||
from superset.models.slice import Slice
|
||||
from superset.models.sql_lab import Query
|
||||
from superset.result_set import SupersetResultSet
|
||||
from superset.utils import core as utils
|
||||
from superset.views import core as views
|
||||
from superset.views.database.views import DatabaseView
|
||||
|
||||
from .base_tests import SupersetTestCase
|
||||
from .fixtures.pyodbcRow import Row
|
||||
|
||||
|
||||
class CoreTests(SupersetTestCase):
|
||||
|
@ -702,18 +703,24 @@ class CoreTests(SupersetTestCase):
|
|||
os.remove(filename_2)
|
||||
|
||||
def test_dataframe_timezone(self):
|
||||
tz = psycopg2.tz.FixedOffsetTimezone(offset=60, name=None)
|
||||
tz = pytz.FixedOffset(60)
|
||||
data = [
|
||||
(datetime.datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=tz),),
|
||||
(datetime.datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=tz),),
|
||||
(datetime.datetime(2017, 11, 18, 22, 6, 30, tzinfo=tz),),
|
||||
]
|
||||
df = dataframe.SupersetDataFrame(list(data), [["data"]], BaseEngineSpec)
|
||||
data = df.data
|
||||
results = SupersetResultSet(list(data), [["data"]], BaseEngineSpec)
|
||||
df = results.to_pandas_df()
|
||||
data = dataframe.df_to_records(df)
|
||||
json_str = json.dumps(data, default=utils.pessimistic_json_iso_dttm_ser)
|
||||
self.assertDictEqual(
|
||||
data[0], {"data": pd.Timestamp("2017-11-18 21:53:00.219225+0100", tz=tz)}
|
||||
)
|
||||
self.assertDictEqual(
|
||||
data[1], {"data": pd.Timestamp("2017-11-18 22:06:30.061810+0100", tz=tz)}
|
||||
data[1], {"data": pd.Timestamp("2017-11-18 22:06:30+0100", tz=tz)}
|
||||
)
|
||||
self.assertEqual(
|
||||
json_str,
|
||||
'[{"data": "2017-11-18T21:53:00.219225+01:00"}, {"data": "2017-11-18T22:06:30+01:00"}]',
|
||||
)
|
||||
|
||||
def test_mssql_engine_spec_pymssql(self):
|
||||
|
@ -722,26 +729,11 @@ class CoreTests(SupersetTestCase):
|
|||
(1, 1, datetime.datetime(2017, 10, 19, 23, 39, 16, 660000)),
|
||||
(2, 2, datetime.datetime(2018, 10, 19, 23, 39, 16, 660000)),
|
||||
]
|
||||
df = dataframe.SupersetDataFrame(
|
||||
results = SupersetResultSet(
|
||||
list(data), [["col1"], ["col2"], ["col3"]], MssqlEngineSpec
|
||||
)
|
||||
data = df.data
|
||||
self.assertEqual(len(data), 2)
|
||||
self.assertEqual(
|
||||
data[0],
|
||||
{"col1": 1, "col2": 1, "col3": pd.Timestamp("2017-10-19 23:39:16.660000")},
|
||||
)
|
||||
|
||||
def test_mssql_engine_spec_odbc(self):
|
||||
# Test for case when pyodbc.Row is returned (msodbc driver)
|
||||
data = [
|
||||
Row((1, 1, datetime.datetime(2017, 10, 19, 23, 39, 16, 660000))),
|
||||
Row((2, 2, datetime.datetime(2018, 10, 19, 23, 39, 16, 660000))),
|
||||
]
|
||||
df = dataframe.SupersetDataFrame(
|
||||
list(data), [["col1"], ["col2"], ["col3"]], MssqlEngineSpec
|
||||
)
|
||||
data = df.data
|
||||
df = results.to_pandas_df()
|
||||
data = dataframe.df_to_records(df)
|
||||
self.assertEqual(len(data), 2)
|
||||
self.assertEqual(
|
||||
data[0],
|
||||
|
@ -876,14 +868,14 @@ class CoreTests(SupersetTestCase):
|
|||
("d", "datetime"),
|
||||
)
|
||||
db_engine_spec = BaseEngineSpec()
|
||||
cdf = dataframe.SupersetDataFrame(data, cursor_descr, db_engine_spec)
|
||||
results = SupersetResultSet(data, cursor_descr, db_engine_spec)
|
||||
query = {
|
||||
"database_id": 1,
|
||||
"sql": "SELECT * FROM birth_names LIMIT 100",
|
||||
"status": utils.QueryStatus.PENDING,
|
||||
}
|
||||
serialized_data, selected_columns, all_columns, expanded_columns = sql_lab._serialize_and_expand_data(
|
||||
cdf, db_engine_spec, use_new_deserialization
|
||||
results, db_engine_spec, use_new_deserialization
|
||||
)
|
||||
payload = {
|
||||
"query_id": 1,
|
||||
|
@ -919,14 +911,14 @@ class CoreTests(SupersetTestCase):
|
|||
("d", "datetime"),
|
||||
)
|
||||
db_engine_spec = BaseEngineSpec()
|
||||
cdf = dataframe.SupersetDataFrame(data, cursor_descr, db_engine_spec)
|
||||
results = SupersetResultSet(data, cursor_descr, db_engine_spec)
|
||||
query = {
|
||||
"database_id": 1,
|
||||
"sql": "SELECT * FROM birth_names LIMIT 100",
|
||||
"status": utils.QueryStatus.PENDING,
|
||||
}
|
||||
serialized_data, selected_columns, all_columns, expanded_columns = sql_lab._serialize_and_expand_data(
|
||||
cdf, db_engine_spec, use_new_deserialization
|
||||
results, db_engine_spec, use_new_deserialization
|
||||
)
|
||||
payload = {
|
||||
"query_id": 1,
|
||||
|
@ -953,7 +945,8 @@ class CoreTests(SupersetTestCase):
|
|||
deserialized_payload = views._deserialize_results_payload(
|
||||
serialized_payload, query_mock, use_new_deserialization
|
||||
)
|
||||
payload["data"] = dataframe.SupersetDataFrame.format_data(cdf.raw_df)
|
||||
df = results.to_pandas_df()
|
||||
payload["data"] = dataframe.df_to_records(df)
|
||||
|
||||
self.assertDictEqual(deserialized_payload, payload)
|
||||
expand_data.assert_called_once()
|
||||
|
|
|
@ -17,143 +17,35 @@
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from superset.dataframe import dedup, SupersetDataFrame
|
||||
from superset.dataframe import df_to_records
|
||||
from superset.db_engine_specs import BaseEngineSpec
|
||||
from superset.db_engine_specs.presto import PrestoEngineSpec
|
||||
from superset.result_set import SupersetResultSet
|
||||
|
||||
from .base_tests import SupersetTestCase
|
||||
|
||||
|
||||
class SupersetDataFrameTestCase(SupersetTestCase):
|
||||
def test_dedup(self):
|
||||
self.assertEqual(dedup(["foo", "bar"]), ["foo", "bar"])
|
||||
self.assertEqual(
|
||||
dedup(["foo", "bar", "foo", "bar", "Foo"]),
|
||||
["foo", "bar", "foo__1", "bar__1", "Foo"],
|
||||
)
|
||||
self.assertEqual(
|
||||
dedup(["foo", "bar", "bar", "bar", "Bar"]),
|
||||
["foo", "bar", "bar__1", "bar__2", "Bar"],
|
||||
)
|
||||
self.assertEqual(
|
||||
dedup(["foo", "bar", "bar", "bar", "Bar"], case_sensitive=False),
|
||||
["foo", "bar", "bar__1", "bar__2", "Bar__3"],
|
||||
)
|
||||
|
||||
def test_get_columns_basic(self):
|
||||
def test_df_to_records(self):
|
||||
data = [("a1", "b1", "c1"), ("a2", "b2", "c2")]
|
||||
cursor_descr = (("a", "string"), ("b", "string"), ("c", "string"))
|
||||
cdf = SupersetDataFrame(data, cursor_descr, BaseEngineSpec)
|
||||
results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
|
||||
df = results.to_pandas_df()
|
||||
|
||||
self.assertEqual(
|
||||
cdf.columns,
|
||||
df_to_records(df),
|
||||
[{"a": "a1", "b": "b1", "c": "c1"}, {"a": "a2", "b": "b2", "c": "c2"}],
|
||||
)
|
||||
|
||||
def test_js_max_int(self):
|
||||
data = [(1, 1239162456494753670, "c1"), (2, 100, "c2")]
|
||||
cursor_descr = (("a", "int"), ("b", "int"), ("c", "string"))
|
||||
results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
|
||||
df = results.to_pandas_df()
|
||||
|
||||
self.assertEqual(
|
||||
df_to_records(df),
|
||||
[
|
||||
{"is_date": False, "type": "STRING", "name": "a", "is_dim": True},
|
||||
{"is_date": False, "type": "STRING", "name": "b", "is_dim": True},
|
||||
{"is_date": False, "type": "STRING", "name": "c", "is_dim": True},
|
||||
{"a": 1, "b": "1239162456494753670", "c": "c1"},
|
||||
{"a": 2, "b": 100, "c": "c2"},
|
||||
],
|
||||
)
|
||||
|
||||
def test_get_columns_with_int(self):
|
||||
data = [("a1", 1), ("a2", 2)]
|
||||
cursor_descr = (("a", "string"), ("b", "int"))
|
||||
cdf = SupersetDataFrame(data, cursor_descr, BaseEngineSpec)
|
||||
self.assertEqual(
|
||||
cdf.columns,
|
||||
[
|
||||
{"is_date": False, "type": "STRING", "name": "a", "is_dim": True},
|
||||
{
|
||||
"is_date": False,
|
||||
"type": "INT",
|
||||
"name": "b",
|
||||
"is_dim": False,
|
||||
"agg": "sum",
|
||||
},
|
||||
],
|
||||
)
|
||||
|
||||
def test_get_columns_type_inference(self):
|
||||
data = [(1.2, 1), (3.14, 2)]
|
||||
cursor_descr = (("a", None), ("b", None))
|
||||
cdf = SupersetDataFrame(data, cursor_descr, BaseEngineSpec)
|
||||
self.assertEqual(
|
||||
cdf.columns,
|
||||
[
|
||||
{
|
||||
"is_date": False,
|
||||
"type": "FLOAT",
|
||||
"name": "a",
|
||||
"is_dim": False,
|
||||
"agg": "sum",
|
||||
},
|
||||
{
|
||||
"is_date": False,
|
||||
"type": "INT",
|
||||
"name": "b",
|
||||
"is_dim": False,
|
||||
"agg": "sum",
|
||||
},
|
||||
],
|
||||
)
|
||||
|
||||
def test_is_date(self):
|
||||
f = SupersetDataFrame.is_date
|
||||
self.assertEqual(f(np.dtype("M"), ""), True)
|
||||
self.assertEqual(f(np.dtype("f"), "DATETIME"), True)
|
||||
self.assertEqual(f(np.dtype("i"), "TIMESTAMP"), True)
|
||||
self.assertEqual(f(None, "DATETIME"), True)
|
||||
self.assertEqual(f(None, "TIMESTAMP"), True)
|
||||
|
||||
self.assertEqual(f(None, ""), False)
|
||||
self.assertEqual(f(np.dtype(np.int32), ""), False)
|
||||
|
||||
def test_dedup_with_data(self):
|
||||
data = [("a", 1), ("a", 2)]
|
||||
cursor_descr = (("a", "string"), ("a", "string"))
|
||||
cdf = SupersetDataFrame(data, cursor_descr, BaseEngineSpec)
|
||||
self.assertListEqual(cdf.column_names, ["a", "a__1"])
|
||||
|
||||
def test_int64_with_missing_data(self):
|
||||
data = [(None,), (1239162456494753670,), (None,), (None,), (None,), (None,)]
|
||||
cursor_descr = [("user_id", "bigint", None, None, None, None, True)]
|
||||
|
||||
# the base engine spec does not provide a dtype based on the cursor
|
||||
# description, so the column is inferred as float64 because of the
|
||||
# missing data
|
||||
cdf = SupersetDataFrame(data, cursor_descr, BaseEngineSpec)
|
||||
np.testing.assert_array_equal(
|
||||
cdf.raw_df.values.tolist(),
|
||||
[[np.nan], [1.2391624564947538e18], [np.nan], [np.nan], [np.nan], [np.nan]],
|
||||
)
|
||||
|
||||
# currently only Presto provides a dtype based on the cursor description
|
||||
cdf = SupersetDataFrame(data, cursor_descr, PrestoEngineSpec)
|
||||
np.testing.assert_array_equal(
|
||||
cdf.raw_df.values.tolist(),
|
||||
[[np.nan], [1239162456494753670], [np.nan], [np.nan], [np.nan], [np.nan]],
|
||||
)
|
||||
|
||||
def test_pandas_datetime64(self):
|
||||
data = [(None,)]
|
||||
cursor_descr = [("ds", "timestamp", None, None, None, None, True)]
|
||||
cdf = SupersetDataFrame(data, cursor_descr, PrestoEngineSpec)
|
||||
self.assertEqual(cdf.raw_df.dtypes[0], np.dtype("<M8[ns]"))
|
||||
|
||||
def test_no_type_coercion(self):
|
||||
data = [("a", 1), ("b", 2)]
|
||||
cursor_descr = [
|
||||
("one", "varchar", None, None, None, None, True),
|
||||
("two", "integer", None, None, None, None, True),
|
||||
]
|
||||
cdf = SupersetDataFrame(data, cursor_descr, PrestoEngineSpec)
|
||||
self.assertEqual(cdf.raw_df.dtypes[0], np.dtype("O"))
|
||||
self.assertEqual(cdf.raw_df.dtypes[1], pd.Int64Dtype())
|
||||
|
||||
def test_empty_data(self):
|
||||
data = []
|
||||
cursor_descr = [
|
||||
("one", "varchar", None, None, None, None, True),
|
||||
("two", "integer", None, None, None, None, True),
|
||||
]
|
||||
cdf = SupersetDataFrame(data, cursor_descr, PrestoEngineSpec)
|
||||
self.assertEqual(cdf.raw_df.dtypes[0], np.dtype("O"))
|
||||
self.assertEqual(cdf.raw_df.dtypes[1], pd.Int64Dtype())
|
||||
|
|
|
@ -0,0 +1,150 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from superset.dataframe import df_to_records
|
||||
from superset.db_engine_specs import BaseEngineSpec
|
||||
from superset.result_set import dedup, SupersetResultSet
|
||||
|
||||
from .base_tests import SupersetTestCase
|
||||
|
||||
|
||||
class SupersetResultSetTestCase(SupersetTestCase):
|
||||
def test_dedup(self):
|
||||
self.assertEqual(dedup(["foo", "bar"]), ["foo", "bar"])
|
||||
self.assertEqual(
|
||||
dedup(["foo", "bar", "foo", "bar", "Foo"]),
|
||||
["foo", "bar", "foo__1", "bar__1", "Foo"],
|
||||
)
|
||||
self.assertEqual(
|
||||
dedup(["foo", "bar", "bar", "bar", "Bar"]),
|
||||
["foo", "bar", "bar__1", "bar__2", "Bar"],
|
||||
)
|
||||
self.assertEqual(
|
||||
dedup(["foo", "bar", "bar", "bar", "Bar"], case_sensitive=False),
|
||||
["foo", "bar", "bar__1", "bar__2", "Bar__3"],
|
||||
)
|
||||
|
||||
def test_get_columns_basic(self):
|
||||
data = [("a1", "b1", "c1"), ("a2", "b2", "c2")]
|
||||
cursor_descr = (("a", "string"), ("b", "string"), ("c", "string"))
|
||||
results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
|
||||
self.assertEqual(
|
||||
results.columns,
|
||||
[
|
||||
{"is_date": False, "type": "STRING", "name": "a"},
|
||||
{"is_date": False, "type": "STRING", "name": "b"},
|
||||
{"is_date": False, "type": "STRING", "name": "c"},
|
||||
],
|
||||
)
|
||||
|
||||
def test_get_columns_with_int(self):
|
||||
data = [("a1", 1), ("a2", 2)]
|
||||
cursor_descr = (("a", "string"), ("b", "int"))
|
||||
results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
|
||||
self.assertEqual(
|
||||
results.columns,
|
||||
[
|
||||
{"is_date": False, "type": "STRING", "name": "a"},
|
||||
{"is_date": False, "type": "INT", "name": "b"},
|
||||
],
|
||||
)
|
||||
|
||||
def test_get_columns_type_inference(self):
|
||||
data = [
|
||||
(1.2, 1, "foo", datetime(2018, 10, 19, 23, 39, 16, 660000), True),
|
||||
(3.14, 2, "bar", datetime(2019, 10, 19, 23, 39, 16, 660000), False),
|
||||
]
|
||||
cursor_descr = (("a", None), ("b", None), ("c", None), ("d", None), ("e", None))
|
||||
results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
|
||||
self.assertEqual(
|
||||
results.columns,
|
||||
[
|
||||
{"is_date": False, "type": "FLOAT", "name": "a"},
|
||||
{"is_date": False, "type": "INT", "name": "b"},
|
||||
{"is_date": False, "type": "STRING", "name": "c"},
|
||||
{"is_date": True, "type": "DATETIME", "name": "d"},
|
||||
{"is_date": False, "type": "BOOL", "name": "e"},
|
||||
],
|
||||
)
|
||||
|
||||
def test_is_date(self):
|
||||
is_date = SupersetResultSet.is_date
|
||||
self.assertEqual(is_date("DATETIME"), True)
|
||||
self.assertEqual(is_date("TIMESTAMP"), True)
|
||||
self.assertEqual(is_date("STRING"), False)
|
||||
self.assertEqual(is_date(""), False)
|
||||
self.assertEqual(is_date(None), False)
|
||||
|
||||
def test_dedup_with_data(self):
|
||||
data = [("a", 1), ("a", 2)]
|
||||
cursor_descr = (("a", "string"), ("a", "string"))
|
||||
results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
|
||||
column_names = [col["name"] for col in results.columns]
|
||||
self.assertListEqual(column_names, ["a", "a__1"])
|
||||
|
||||
def test_int64_with_missing_data(self):
|
||||
data = [(None,), (1239162456494753670,), (None,), (None,), (None,), (None,)]
|
||||
cursor_descr = [("user_id", "bigint", None, None, None, None, True)]
|
||||
results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
|
||||
self.assertEqual(results.columns[0]["type"], "BIGINT")
|
||||
|
||||
def test_nullable_bool(self):
|
||||
data = [(None,), (True,), (None,), (None,), (None,), (None,)]
|
||||
cursor_descr = [("is_test", "bool", None, None, None, None, True)]
|
||||
results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
|
||||
self.assertEqual(results.columns[0]["type"], "BOOL")
|
||||
df = results.to_pandas_df()
|
||||
self.assertEqual(
|
||||
df_to_records(df),
|
||||
[
|
||||
{"is_test": None},
|
||||
{"is_test": True},
|
||||
{"is_test": None},
|
||||
{"is_test": None},
|
||||
{"is_test": None},
|
||||
{"is_test": None},
|
||||
],
|
||||
)
|
||||
|
||||
def test_empty_datetime(self):
|
||||
data = [(None,)]
|
||||
cursor_descr = [("ds", "timestamp", None, None, None, None, True)]
|
||||
results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
|
||||
self.assertEqual(results.columns[0]["type"], "TIMESTAMP")
|
||||
|
||||
def test_no_type_coercion(self):
|
||||
data = [("a", 1), ("b", 2)]
|
||||
cursor_descr = [
|
||||
("one", "varchar", None, None, None, None, True),
|
||||
("two", "int", None, None, None, None, True),
|
||||
]
|
||||
results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
|
||||
self.assertEqual(results.columns[0]["type"], "VARCHAR")
|
||||
self.assertEqual(results.columns[1]["type"], "INT")
|
||||
|
||||
def test_empty_data(self):
|
||||
data = []
|
||||
cursor_descr = [
|
||||
("emptyone", "varchar", None, None, None, None, True),
|
||||
("emptytwo", "int", None, None, None, None, True),
|
||||
]
|
||||
results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
|
||||
self.assertEqual(results.columns, [])
|
|
@ -23,9 +23,10 @@ import prison
|
|||
|
||||
from superset import db, security_manager
|
||||
from superset.connectors.sqla.models import SqlaTable
|
||||
from superset.dataframe import SupersetDataFrame
|
||||
from superset.dataframe import df_to_records
|
||||
from superset.db_engine_specs import BaseEngineSpec
|
||||
from superset.models.sql_lab import Query
|
||||
from superset.result_set import SupersetResultSet
|
||||
from superset.utils.core import datetime_to_epoch, get_example_database
|
||||
|
||||
from .base_tests import SupersetTestCase
|
||||
|
@ -266,29 +267,29 @@ class SqlLabTests(SupersetTestCase):
|
|||
raise_on_error=True,
|
||||
)
|
||||
|
||||
def test_df_conversion_no_dict(self):
|
||||
def test_ps_conversion_no_dict(self):
|
||||
cols = [["string_col", "string"], ["int_col", "int"], ["float_col", "float"]]
|
||||
data = [["a", 4, 4.0]]
|
||||
cdf = SupersetDataFrame(data, cols, BaseEngineSpec)
|
||||
results = SupersetResultSet(data, cols, BaseEngineSpec)
|
||||
|
||||
self.assertEqual(len(data), cdf.size)
|
||||
self.assertEqual(len(cols), len(cdf.columns))
|
||||
self.assertEqual(len(data), results.size)
|
||||
self.assertEqual(len(cols), len(results.columns))
|
||||
|
||||
def test_df_conversion_tuple(self):
|
||||
def test_pa_conversion_tuple(self):
|
||||
cols = ["string_col", "int_col", "list_col", "float_col"]
|
||||
data = [("Text", 111, [123], 1.0)]
|
||||
cdf = SupersetDataFrame(data, cols, BaseEngineSpec)
|
||||
results = SupersetResultSet(data, cols, BaseEngineSpec)
|
||||
|
||||
self.assertEqual(len(data), cdf.size)
|
||||
self.assertEqual(len(cols), len(cdf.columns))
|
||||
self.assertEqual(len(data), results.size)
|
||||
self.assertEqual(len(cols), len(results.columns))
|
||||
|
||||
def test_df_conversion_dict(self):
|
||||
def test_pa_conversion_dict(self):
|
||||
cols = ["string_col", "dict_col", "int_col"]
|
||||
data = [["a", {"c1": 1, "c2": 2, "c3": 3}, 4]]
|
||||
cdf = SupersetDataFrame(data, cols, BaseEngineSpec)
|
||||
results = SupersetResultSet(data, cols, BaseEngineSpec)
|
||||
|
||||
self.assertEqual(len(data), cdf.size)
|
||||
self.assertEqual(len(cols), len(cdf.columns))
|
||||
self.assertEqual(len(data), results.size)
|
||||
self.assertEqual(len(cols), len(results.columns))
|
||||
|
||||
def test_sqllab_viz(self):
|
||||
self.login("admin")
|
||||
|
@ -298,19 +299,8 @@ class SqlLabTests(SupersetTestCase):
|
|||
"datasourceName": f"test_viz_flow_table_{random()}",
|
||||
"schema": "superset",
|
||||
"columns": [
|
||||
{
|
||||
"is_date": False,
|
||||
"type": "STRING",
|
||||
"name": f"viz_type_{random()}",
|
||||
"is_dim": True,
|
||||
},
|
||||
{
|
||||
"is_date": False,
|
||||
"type": "OBJECT",
|
||||
"name": f"ccount_{random()}",
|
||||
"is_dim": True,
|
||||
"agg": "sum",
|
||||
},
|
||||
{"is_date": False, "type": "STRING", "name": f"viz_type_{random()}"},
|
||||
{"is_date": False, "type": "OBJECT", "name": f"ccount_{random()}"},
|
||||
],
|
||||
"sql": """\
|
||||
SELECT *
|
||||
|
|
Loading…
Reference in New Issue