datetime format and database expression on column level (#652)

* time format minor features added

* add description for datetime format input

* db version bug walkaround

* removed unecessary comments and fixed minor bug

* fixed code style

* minor fix

* fixed missing time format column in DruidDatasource

* Update models.py

Minor style fix

* Revert "Update models.py"

This reverts commit 6897c388e0.

* removed timestamp_format from druid and removed try catch in migration

* Using spaces, not tabs

* get the most updated migration and add the migration on the head of it

* remove vscode setting file

* use colunm based dttm_format

* modify dttm_converter

* modify datetime viz

* added comments and documents

* fixed some description and removed unnecessary import

* fix migration head

* minor style

* minor style

* deleted empty lines

* delete print statement

* add epoch converter

* error fixed

* fixed epoch parsing issue

* delete unnecessary lines

* fixed typo

* fix minor error

* fix styling issues

* fix styling error

* fixed typo

* support epoch_ms and did some refactoring

* fixed styling error

* fixed styling error

* add one more dataset to test dttm_format and db_expr

* add more slices

* styling

* specified String() lenght
This commit is contained in:
yxjames 2016-06-27 21:33:44 -07:00 committed by Maxime Beauchemin
parent 3e742c74bb
commit 7a7f61a296
7 changed files with 203 additions and 28 deletions

View File

@ -97,6 +97,9 @@ def load_examples(load_test_data):
print("Loading [Random long/lat data]")
data.load_long_lat_data()
print("Loading [Multiformat time series]")
data.load_multiformat_time_series_data()
if load_test_data:
print("Loading [Unicode test data]")
data.load_unicode_test_data()

View File

@ -12,7 +12,7 @@ import datetime
import random
import pandas as pd
from sqlalchemy import String, DateTime, Date, Float
from sqlalchemy import String, DateTime, Date, Float, BigInteger
from caravel import app, db, models, utils
@ -1020,3 +1020,84 @@ def load_long_lat_data():
params=get_slice_json(slice_data),
)
merge_slice(slc)
def load_multiformat_time_series_data():
"""Loading time series data from a zip file in the repo"""
with gzip.open(os.path.join(DATA_FOLDER, 'multiformat_time_series.json.gz')) as f:
pdf = pd.read_json(f)
pdf.ds = pd.to_datetime(pdf.ds, unit='s')
pdf.ds2 = pd.to_datetime(pdf.ds2, unit='s')
pdf.to_sql(
'multiformat_time_series',
db.engine,
if_exists='replace',
chunksize=500,
dtype={
"ds": Date,
'ds2': DateTime,
"epoch_s": BigInteger,
"epoch_ms": BigInteger,
"string0": String(100),
"string1": String(100),
"string2": String(100),
"string3": String(100),
},
index=False)
print("Done loading table!")
print("-" * 80)
print("Creating table [multiformat_time_series] reference")
obj = db.session.query(TBL).filter_by(table_name='multiformat_time_series').first()
if not obj:
obj = TBL(table_name='multiformat_time_series')
obj.main_dttm_col = 'ds'
obj.database = get_or_create_db(db.session)
obj.is_featured = False
dttm_and_expr_dict = {
'ds': [None, None],
'ds2': [None, None],
'epoch_s': ['epoch_s', None],
'epoch_ms': ['epoch_ms', None],
'string2': ['%Y%m%d-%H%M%S', None],
'string1': ['%Y-%m-%d^%H:%M:%S', None],
'string0': ['%Y-%m-%d %H:%M:%S.%f', None],
'string3': ['%Y/%m/%d%H:%M:%S.%f', None],
}
for col in obj.table_columns:
print(col.column_name)
dttm_and_expr = dttm_and_expr_dict[col.column_name]
col.python_date_format = dttm_and_expr[0]
col.dbatabase_expr = dttm_and_expr[1]
db.session.merge(obj)
db.session.commit()
obj.fetch_metadata()
tbl = obj
print("Creating some slices")
i = 0
for col in tbl.table_columns:
slice_data = {
"granularity_sqla": col.column_name,
"datasource_id": "8",
"datasource_name": "multiformat_time_series",
"datasource_type": "table",
"granularity": "day",
"row_limit": config.get("ROW_LIMIT"),
"since": "1 year ago",
"until": "now",
"where": "",
"viz_type": "cal_heatmap",
"domain_granularity": "month",
"subdomain_granularity": "day",
}
slc = Slice(
slice_name="Calendar Heatmap multiformat" + str(i),
viz_type='cal_heatmap',
datasource_type='table',
table=tbl,
params=get_slice_json(slice_data),
)
i += 1
merge_slice(slc)

Binary file not shown.

View File

@ -0,0 +1,24 @@
"""add dttm_format related fields in table_columns
Revision ID: 960c69cb1f5b
Revises: d8bc074f7aad
Create Date: 2016-06-16 14:15:19.573183
"""
# revision identifiers, used by Alembic.
revision = '960c69cb1f5b'
down_revision = 'd8bc074f7aad'
from alembic import op
import sqlalchemy as sa
def upgrade():
op.add_column('table_columns', sa.Column('python_date_format', sa.String(length=255), nullable=True))
op.add_column('table_columns', sa.Column('database_expression', sa.String(length=255), nullable=True))
def downgrade():
op.drop_column('table_columns', 'python_date_format')
op.drop_column('table_columns', 'database_expression')

View File

@ -445,24 +445,6 @@ class Database(Model, AuditMixinNullable):
if self.sqlalchemy_uri.startswith(db_type):
return grains
def dttm_converter(self, dttm):
"""Returns a string that the database flavor understands as a date"""
default = "'{}'".format(dttm.strftime('%Y-%m-%d %H:%M:%S.%f'))
iso = dttm.isoformat()
d = {
'mssql': "CONVERT(DATETIME, '{}', 126)".format(iso), #untested
'mysql': default,
'oracle':
"""TO_TIMESTAMP('{}', 'YYYY-MM-DD"T"HH24:MI:SS.ff6')""".format(
dttm.isoformat()),
'presto': default,
'sqlite': default,
}
for k, v in d.items():
if self.sqlalchemy_uri.startswith(k):
return v
return default
def grains_dict(self):
return {grain.name: grain for grain in self.grains()}
@ -525,6 +507,7 @@ class SqlaTable(Model, Queryable, AuditMixinNullable):
offset = Column(Integer, default=0)
cache_timeout = Column(Integer)
schema = Column(String(255))
table_columns = relationship("TableColumn", back_populates="table")
baselink = "tablemodelview"
@ -607,6 +590,12 @@ class SqlaTable(Model, Queryable, AuditMixinNullable):
def sql_link(self):
return '<a href="{}">SQL</a>'.format(self.sql_url)
def get_col(self, col_name):
columns = self.table_columns
for col in columns:
if col_name == col.column_name:
return col
def query( # sqla
self, groupby, metrics,
granularity,
@ -661,7 +650,8 @@ class SqlaTable(Model, Queryable, AuditMixinNullable):
metrics_exprs = []
if granularity:
dttm_expr = cols[granularity].sqla_col.label('timestamp')
dttm_col = cols[granularity]
dttm_expr = dttm_col.sqla_col.label('timestamp')
timestamp = dttm_expr
# Transforming time grain into an expression based on configuration
@ -677,18 +667,20 @@ class SqlaTable(Model, Queryable, AuditMixinNullable):
select_exprs += [timestamp_grain]
groupby_exprs += [timestamp_grain]
tf = '%Y-%m-%d %H:%M:%S.%f'
outer_from = text(dttm_col.dttm_sql_literal(from_dttm))
outer_to = text(dttm_col.dttm_sql_literal(to_dttm))
time_filter = [
timestamp >= text(self.database.dttm_converter(from_dttm)),
timestamp <= text(self.database.dttm_converter(to_dttm)),
timestamp >= outer_from,
timestamp <= outer_to,
]
inner_time_filter = copy(time_filter)
if inner_from_dttm:
inner_time_filter[0] = timestamp >= text(
self.database.dttm_converter(inner_from_dttm))
dttm_col.dttm_sql_literal(inner_from_dttm))
if inner_to_dttm:
inner_time_filter[1] = timestamp <= text(
self.database.dttm_converter(inner_to_dttm))
dttm_col.dttm_sql_literal(inner_to_dttm))
else:
inner_time_filter = []
@ -909,6 +901,8 @@ class TableColumn(Model, AuditMixinNullable):
filterable = Column(Boolean, default=False)
expression = Column(Text, default='')
description = Column(Text, default='')
python_date_format = Column(String(255))
database_expression = Column(String(255))
num_types = ('DOUBLE', 'FLOAT', 'INT', 'BIGINT', 'LONG')
date_types = ('DATE', 'TIME')
@ -938,6 +932,39 @@ class TableColumn(Model, AuditMixinNullable):
col = literal_column(self.expression).label(name)
return col
def dttm_sql_literal(self, dttm):
"""Convert datetime object to string
If datebase_expression is empty, the internal dttm
will be parsed as the string with the pattern that
user input (python_date_format)
If database_expression is not empty, the internal dttm
will be parsed as the sql sentence for datebase to convert
"""
tf = self.python_date_format or '%Y-%m-%d %H:%M:%S.%f'
if self.database_expression:
return self.database_expression.format(dttm.strftime('%Y-%m-%d %H:%M:%S'))
elif tf == 'epoch_s':
return str((dttm - datetime(1970, 1, 1)).total_seconds())
elif tf == 'epoch_ms':
return str((dttm - datetime(1970, 1, 1)).total_seconds()*1000.0)
else:
default = "'{}'".format(dttm.strftime(tf))
iso = dttm.isoformat()
d = {
'mssql': "CONVERT(DATETIME, '{}', 126)".format(iso), # untested
'mysql': default,
'oracle':
"""TO_TIMESTAMP('{}', 'YYYY-MM-DD"T"HH24:MI:SS.ff6')""".format(
dttm.isoformat()),
'presto': default,
'sqlite': default,
}
for k, v in d.items():
if self.table.database.sqlalchemy_uri.startswith(k):
return v
return default
class DruidCluster(Model, AuditMixinNullable):

View File

@ -187,7 +187,7 @@ class TableColumnInlineView(CompactCRUDMixin, CaravelModelView): # noqa
edit_columns = [
'column_name', 'verbose_name', 'description', 'groupby', 'filterable',
'table', 'count_distinct', 'sum', 'min', 'max', 'expression',
'is_dttm', ]
'is_dttm', 'python_date_format', 'database_expression']
add_columns = edit_columns
list_columns = [
'column_name', 'type', 'groupby', 'filterable', 'count_distinct',
@ -201,6 +201,24 @@ class TableColumnInlineView(CompactCRUDMixin, CaravelModelView): # noqa
'expression': utils.markdown(
"a valid SQL expression as supported by the underlying backend. "
"Example: `substr(name, 1, 1)`", True),
'python_date_format': utils.markdown(Markup(
"The pattern of timestamp format, use "
"<a href='https://docs.python.org/2/library/"
"datetime.html#strftime-strptime-behavior'>"
"python datetime string pattern</a> "
"expression. If time is stored in epoch "
"format, put `epoch_s` or `epoch_ms`. Leave `Database Expression` "
"below empty if timestamp is stored in "
"String or Integer(epoch) type"), True),
'database_expression': utils.markdown(
"The database expression to cast internal datetime "
"constants to database date/timestamp type according to the DBAPI. "
"The expression should follow the pattern of "
"%Y-%m-%d %H:%M:%S, based on different DBAPI. "
"The string should be a python string formatter \n"
"`Ex: TO_DATE('{}', 'YYYY-MM-DD HH24:MI:SS')` for Oracle"
"Caravel uses default expression based on DB URI if this "
"field is blank.", True),
}
label_columns = {
'column_name': _("Column"),
@ -215,6 +233,8 @@ class TableColumnInlineView(CompactCRUDMixin, CaravelModelView): # noqa
'max': _("Max"),
'expression': _("Expression"),
'is_dttm': _("Is temporal"),
'python_date_format': _("Datetime Format"),
'database_expression': _("Database Expression")
}
appbuilder.add_view_no_menu(TableColumnInlineView)
@ -388,7 +408,8 @@ class TableModelView(CaravelModelView, DeleteMixin): # noqa
'table_name', 'database', 'schema',
'default_endpoint', 'offset', 'cache_timeout']
edit_columns = [
'table_name', 'is_featured', 'database', 'schema', 'description', 'owner',
'table_name', 'is_featured', 'database', 'schema',
'description', 'owner',
'main_dttm_col', 'default_endpoint', 'offset', 'cache_timeout']
related_views = [TableColumnInlineView, SqlMetricInlineView]
base_order = ('changed_on', 'desc')

View File

@ -146,15 +146,34 @@ class BaseViz(object):
self.error_msg = ""
self.results = None
timestamp_format = None
if self.datasource.type == 'table':
dttm_col = self.datasource.get_col(query_obj['granularity'])
if dttm_col:
timestamp_format = dttm_col.python_date_format
# The datasource here can be different backend but the interface is common
self.results = self.datasource.query(**query_obj)
self.query = self.results.query
df = self.results.df
# Transform the timestamp we received from database to pandas supported
# datetime format. If no python_date_format is specified, the pattern will
# be considered as the default ISO date format
# If the datetime format is unix, the parse will use the corresponding
# parsing logic.
if df is None or df.empty:
raise Exception("No data, review your incantations!")
else:
if 'timestamp' in df.columns:
df.timestamp = pd.to_datetime(df.timestamp, utc=False)
if timestamp_format == "epoch_s":
df.timestamp = pd.to_datetime(
df.timestamp, utc=False, unit="s")
elif timestamp_format == "epoch_ms":
df.timestamp = pd.to_datetime(
df.timestamp, utc=False, unit="ms")
else:
df.timestamp = pd.to_datetime(
df.timestamp, utc=False, format=timestamp_format)
if self.datasource.offset:
df.timestamp += timedelta(hours=self.datasource.offset)
df.replace([np.inf, -np.inf], np.nan)