# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
|
2018-08-14 16:01:28 -04:00
|
|
|
import numpy as np
|
2019-09-20 08:31:13 -04:00
|
|
|
import pandas as pd
|
2018-08-14 16:01:28 -04:00
|
|
|
|
2018-06-28 00:35:12 -04:00
|
|
|
from superset.dataframe import dedup, SupersetDataFrame
|
|
|
|
from superset.db_engine_specs import BaseEngineSpec
|
2019-09-17 11:16:09 -04:00
|
|
|
from superset.db_engine_specs.presto import PrestoEngineSpec
|
2018-06-28 00:35:12 -04:00
|
|
|
from .base_tests import SupersetTestCase
|
|
|
|
|
|
|
|
|
|
|
|
class SupersetDataFrameTestCase(SupersetTestCase):
    """Tests for ``dedup`` and for ``SupersetDataFrame`` column/dtype handling."""

    def test_dedup(self):
        """Repeated names get a ``__N`` suffix; matching is case-sensitive by default."""
        self.assertEqual(dedup(["foo", "bar"]), ["foo", "bar"])
        self.assertEqual(
            dedup(["foo", "bar", "foo", "bar", "Foo"]),
            ["foo", "bar", "foo__1", "bar__1", "Foo"],
        )
        self.assertEqual(
            dedup(["foo", "bar", "bar", "bar", "Bar"]),
            ["foo", "bar", "bar__1", "bar__2", "Bar"],
        )
        # With case_sensitive=False, "Bar" counts as another occurrence of "bar".
        self.assertEqual(
            dedup(["foo", "bar", "bar", "bar", "Bar"], case_sensitive=False),
            ["foo", "bar", "bar__1", "bar__2", "Bar__3"],
        )

    def test_get_columns_basic(self):
        """String columns are reported as non-date ``STRING`` dimensions."""
        data = [("a1", "b1", "c1"), ("a2", "b2", "c2")]
        cursor_descr = (("a", "string"), ("b", "string"), ("c", "string"))
        cdf = SupersetDataFrame(data, cursor_descr, BaseEngineSpec)
        self.assertEqual(
            cdf.columns,
            [
                {"is_date": False, "type": "STRING", "name": "a", "is_dim": True},
                {"is_date": False, "type": "STRING", "name": "b", "is_dim": True},
                {"is_date": False, "type": "STRING", "name": "c", "is_dim": True},
            ],
        )

    def test_get_columns_with_int(self):
        """An int column is reported as a non-dimension with a ``sum`` aggregate."""
        data = [("a1", 1), ("a2", 2)]
        cursor_descr = (("a", "string"), ("b", "int"))
        cdf = SupersetDataFrame(data, cursor_descr, BaseEngineSpec)
        self.assertEqual(
            cdf.columns,
            [
                {"is_date": False, "type": "STRING", "name": "a", "is_dim": True},
                {
                    "is_date": False,
                    "type": "INT",
                    "name": "b",
                    "is_dim": False,
                    "agg": "sum",
                },
            ],
        )

    def test_get_columns_type_inference(self):
        """With no declared cursor types, ``FLOAT``/``INT`` are inferred from the data."""
        data = [(1.2, 1), (3.14, 2)]
        cursor_descr = (("a", None), ("b", None))
        cdf = SupersetDataFrame(data, cursor_descr, BaseEngineSpec)
        self.assertEqual(
            cdf.columns,
            [
                {
                    "is_date": False,
                    "type": "FLOAT",
                    "name": "a",
                    "is_dim": False,
                    "agg": "sum",
                },
                {
                    "is_date": False,
                    "type": "INT",
                    "name": "b",
                    "is_dim": False,
                    "agg": "sum",
                },
            ],
        )

    def test_is_date(self):
        """``is_date`` is true for a datetime dtype or a DATETIME/TIMESTAMP type name."""
        f = SupersetDataFrame.is_date
        self.assertEqual(f(np.dtype("M"), ""), True)
        self.assertEqual(f(np.dtype("f"), "DATETIME"), True)
        self.assertEqual(f(np.dtype("i"), "TIMESTAMP"), True)
        self.assertEqual(f(None, "DATETIME"), True)
        self.assertEqual(f(None, "TIMESTAMP"), True)

        self.assertEqual(f(None, ""), False)
        self.assertEqual(f(np.dtype(np.int32), ""), False)

    def test_dedup_with_data(self):
        """Duplicate column names in the cursor description are de-duplicated."""
        data = [("a", 1), ("a", 2)]
        cursor_descr = (("a", "string"), ("a", "string"))
        cdf = SupersetDataFrame(data, cursor_descr, BaseEngineSpec)
        self.assertListEqual(cdf.column_names, ["a", "a__1"])

    def test_int64_with_missing_data(self):
        """A large bigint survives missing rows only when the engine supplies a dtype."""
        data = [(None,), (1239162456494753670,), (None,), (None,), (None,), (None,)]
        cursor_descr = [("user_id", "bigint", None, None, None, None, True)]

        # the base engine spec does not provide a dtype based on the cursor
        # description, so the column is inferred as float64 because of the
        # missing data
        cdf = SupersetDataFrame(data, cursor_descr, BaseEngineSpec)
        np.testing.assert_array_equal(
            cdf.raw_df.values.tolist(),
            [[np.nan], [1.2391624564947538e18], [np.nan], [np.nan], [np.nan], [np.nan]],
        )

        # currently only Presto provides a dtype based on the cursor description
        cdf = SupersetDataFrame(data, cursor_descr, PrestoEngineSpec)
        np.testing.assert_array_equal(
            cdf.raw_df.values.tolist(),
            [[np.nan], [1239162456494753670], [np.nan], [np.nan], [np.nan], [np.nan]],
        )

    def test_pandas_datetime64(self):
        """A Presto ``timestamp`` column holding only ``None`` is typed ``datetime64[ns]``."""
        data = [(None,)]
        cursor_descr = [("ds", "timestamp", None, None, None, None, True)]
        cdf = SupersetDataFrame(data, cursor_descr, PrestoEngineSpec)
        # .iloc makes the positional lookup explicit; dtypes is indexed by column name.
        self.assertEqual(cdf.raw_df.dtypes.iloc[0], np.dtype("<M8[ns]"))

    def test_no_type_coercion(self):
        """Presto ``varchar`` stays object dtype and ``integer`` becomes nullable Int64."""
        data = [("a", 1), ("b", 2)]
        cursor_descr = [
            ("one", "varchar", None, None, None, None, True),
            ("two", "integer", None, None, None, None, True),
        ]
        cdf = SupersetDataFrame(data, cursor_descr, PrestoEngineSpec)
        # .iloc makes the positional lookup explicit; dtypes is indexed by column name.
        self.assertEqual(cdf.raw_df.dtypes.iloc[0], np.dtype("O"))
        self.assertEqual(cdf.raw_df.dtypes.iloc[1], pd.Int64Dtype())

    def test_empty_data(self):
        """The declared Presto dtypes are applied even when the result set has no rows."""
        data = []
        cursor_descr = [
            ("one", "varchar", None, None, None, None, True),
            ("two", "integer", None, None, None, None, True),
        ]
        cdf = SupersetDataFrame(data, cursor_descr, PrestoEngineSpec)
        # .iloc makes the positional lookup explicit; dtypes is indexed by column name.
        self.assertEqual(cdf.raw_df.dtypes.iloc[0], np.dtype("O"))
        self.assertEqual(cdf.raw_df.dtypes.iloc[1], pd.Int64Dtype())
|