datetime format and database expression on column level (#652)

* time format minor features added

* add description for datetime format input

* db version bug workaround

* removed unnecessary comments and fixed minor bug

* fixed code style

* minor fix

* fixed missing time format column in DruidDatasource

* Update models.py

Minor style fix

* Revert "Update models.py"

This reverts commit 6897c388e0.

* removed timestamp_format from druid and removed try catch in migration

* Using spaces, not tabs

* get the most updated migration and add the migration on the head of it

* remove vscode setting file

* use column based dttm_format

* modify dttm_converter

* modify datetime viz

* added comments and documents

* fixed some description and removed unnecessary import

* fix migration head

* minor style

* minor style

* deleted empty lines

* delete print statement

* add epoch converter

* error fixed

* fixed epoch parsing issue

* delete unnecessary lines

* fixed typo

* fix minor error

* fix styling issues

* fix styling error

* fixed typo

* support epoch_ms and did some refactoring

* fixed styling error

* fixed styling error

* add one more dataset to test dttm_format and db_expr

* add more slices

* styling

* specified String() length
This commit is contained in:
yxjames 2016-06-27 21:33:44 -07:00 committed by Maxime Beauchemin
parent 3e742c74bb
commit 7a7f61a296
7 changed files with 203 additions and 28 deletions

View File

@ -97,6 +97,9 @@ def load_examples(load_test_data):
print("Loading [Random long/lat data]") print("Loading [Random long/lat data]")
data.load_long_lat_data() data.load_long_lat_data()
print("Loading [Multiformat time series]")
data.load_multiformat_time_series_data()
if load_test_data: if load_test_data:
print("Loading [Unicode test data]") print("Loading [Unicode test data]")
data.load_unicode_test_data() data.load_unicode_test_data()

View File

@ -12,7 +12,7 @@ import datetime
import random import random
import pandas as pd import pandas as pd
from sqlalchemy import String, DateTime, Date, Float from sqlalchemy import String, DateTime, Date, Float, BigInteger
from caravel import app, db, models, utils from caravel import app, db, models, utils
@ -1020,3 +1020,84 @@ def load_long_lat_data():
params=get_slice_json(slice_data), params=get_slice_json(slice_data),
) )
merge_slice(slc) merge_slice(slc)
def load_multiformat_time_series_data():
    """Load the multi-format time series example table and its slices.

    The fixture is a gzipped JSON file shipped in the repo; each row
    carries the same instant encoded several ways (date, datetime, epoch
    seconds/milliseconds and assorted string patterns) so that the
    per-column ``python_date_format`` / ``database_expression`` handling
    can be exercised end to end.
    """
    with gzip.open(os.path.join(
            DATA_FOLDER, 'multiformat_time_series.json.gz')) as f:
        pdf = pd.read_json(f)
    pdf.ds = pd.to_datetime(pdf.ds, unit='s')
    pdf.ds2 = pd.to_datetime(pdf.ds2, unit='s')
    pdf.to_sql(
        'multiformat_time_series',
        db.engine,
        if_exists='replace',
        chunksize=500,
        dtype={
            "ds": Date,
            'ds2': DateTime,
            "epoch_s": BigInteger,
            "epoch_ms": BigInteger,
            "string0": String(100),
            "string1": String(100),
            "string2": String(100),
            "string3": String(100),
        },
        index=False)
    print("Done loading table!")
    print("-" * 80)
    print("Creating table [multiformat_time_series] reference")
    obj = db.session.query(TBL).filter_by(
        table_name='multiformat_time_series').first()
    if not obj:
        obj = TBL(table_name='multiformat_time_series')
    obj.main_dttm_col = 'ds'
    obj.database = get_or_create_db(db.session)
    obj.is_featured = False
    # column_name -> [python_date_format, database_expression]
    dttm_and_expr_dict = {
        'ds': [None, None],
        'ds2': [None, None],
        'epoch_s': ['epoch_s', None],
        'epoch_ms': ['epoch_ms', None],
        'string2': ['%Y%m%d-%H%M%S', None],
        'string1': ['%Y-%m-%d^%H:%M:%S', None],
        'string0': ['%Y-%m-%d %H:%M:%S.%f', None],
        'string3': ['%Y/%m/%d%H:%M:%S.%f', None],
    }
    for col in obj.table_columns:
        dttm_and_expr = dttm_and_expr_dict[col.column_name]
        col.python_date_format = dttm_and_expr[0]
        # BUG FIX: the model attribute is ``database_expression`` (see the
        # TableColumn model and the 960c69cb1f5b migration); the previous
        # ``dbatabase_expr`` typo set a throwaway attribute, so the
        # expression was never persisted. Also dropped a leftover debug
        # print of the column name.
        col.database_expression = dttm_and_expr[1]
    db.session.merge(obj)
    db.session.commit()
    obj.fetch_metadata()
    tbl = obj
    print("Creating some slices")
    for i, col in enumerate(tbl.table_columns):
        slice_data = {
            "granularity_sqla": col.column_name,
            "datasource_id": "8",
            "datasource_name": "multiformat_time_series",
            "datasource_type": "table",
            "granularity": "day",
            "row_limit": config.get("ROW_LIMIT"),
            "since": "1 year ago",
            "until": "now",
            "where": "",
            "viz_type": "cal_heatmap",
            "domain_granularity": "month",
            "subdomain_granularity": "day",
        }
        slc = Slice(
            slice_name="Calendar Heatmap multiformat" + str(i),
            viz_type='cal_heatmap',
            datasource_type='table',
            table=tbl,
            params=get_slice_json(slice_data),
        )
        merge_slice(slc)

Binary file not shown.

View File

@ -0,0 +1,24 @@
"""add dttm_format related fields in table_columns

Revision ID: 960c69cb1f5b
Revises: d8bc074f7aad
Create Date: 2016-06-16 14:15:19.573183

"""

# revision identifiers, used by Alembic.
revision = '960c69cb1f5b'
down_revision = 'd8bc074f7aad'

from alembic import op
import sqlalchemy as sa

# Both columns are nullable, so existing rows need no backfill.
_NEW_COLUMNS = ('python_date_format', 'database_expression')


def upgrade():
    """Add per-column datetime handling fields to ``table_columns``."""
    for name in _NEW_COLUMNS:
        op.add_column(
            'table_columns',
            sa.Column(name, sa.String(length=255), nullable=True))


def downgrade():
    """Drop the fields added by :func:`upgrade`."""
    for name in _NEW_COLUMNS:
        op.drop_column('table_columns', name)

View File

@ -445,24 +445,6 @@ class Database(Model, AuditMixinNullable):
if self.sqlalchemy_uri.startswith(db_type): if self.sqlalchemy_uri.startswith(db_type):
return grains return grains
def dttm_converter(self, dttm):
    """Render *dttm* as a literal this database flavor parses as a date."""
    # Generic quoted timestamp; fine for most ANSI-ish backends.
    default = "'{}'".format(dttm.strftime('%Y-%m-%d %H:%M:%S.%f'))
    iso = dttm.isoformat()
    per_flavor = {
        'mssql': "CONVERT(DATETIME, '{}', 126)".format(iso),  # untested
        'mysql': default,
        'oracle':
            """TO_TIMESTAMP('{}', 'YYYY-MM-DD"T"HH24:MI:SS.ff6')""".format(
                iso),
        'presto': default,
        'sqlite': default,
    }
    # URI prefixes are mutually exclusive, so first match is the only match.
    uri = self.sqlalchemy_uri
    return next(
        (literal for prefix, literal in per_flavor.items()
         if uri.startswith(prefix)),
        default)
def grains_dict(self): def grains_dict(self):
return {grain.name: grain for grain in self.grains()} return {grain.name: grain for grain in self.grains()}
@ -525,6 +507,7 @@ class SqlaTable(Model, Queryable, AuditMixinNullable):
offset = Column(Integer, default=0) offset = Column(Integer, default=0)
cache_timeout = Column(Integer) cache_timeout = Column(Integer)
schema = Column(String(255)) schema = Column(String(255))
table_columns = relationship("TableColumn", back_populates="table")
baselink = "tablemodelview" baselink = "tablemodelview"
@ -607,6 +590,12 @@ class SqlaTable(Model, Queryable, AuditMixinNullable):
def sql_link(self): def sql_link(self):
return '<a href="{}">SQL</a>'.format(self.sql_url) return '<a href="{}">SQL</a>'.format(self.sql_url)
def get_col(self, col_name):
    """Return this table's column named *col_name*, or None if absent."""
    return next(
        (candidate for candidate in self.table_columns
         if candidate.column_name == col_name),
        None)
def query( # sqla def query( # sqla
self, groupby, metrics, self, groupby, metrics,
granularity, granularity,
@ -661,7 +650,8 @@ class SqlaTable(Model, Queryable, AuditMixinNullable):
metrics_exprs = [] metrics_exprs = []
if granularity: if granularity:
dttm_expr = cols[granularity].sqla_col.label('timestamp') dttm_col = cols[granularity]
dttm_expr = dttm_col.sqla_col.label('timestamp')
timestamp = dttm_expr timestamp = dttm_expr
# Transforming time grain into an expression based on configuration # Transforming time grain into an expression based on configuration
@ -677,18 +667,20 @@ class SqlaTable(Model, Queryable, AuditMixinNullable):
select_exprs += [timestamp_grain] select_exprs += [timestamp_grain]
groupby_exprs += [timestamp_grain] groupby_exprs += [timestamp_grain]
tf = '%Y-%m-%d %H:%M:%S.%f' outer_from = text(dttm_col.dttm_sql_literal(from_dttm))
outer_to = text(dttm_col.dttm_sql_literal(to_dttm))
time_filter = [ time_filter = [
timestamp >= text(self.database.dttm_converter(from_dttm)), timestamp >= outer_from,
timestamp <= text(self.database.dttm_converter(to_dttm)), timestamp <= outer_to,
] ]
inner_time_filter = copy(time_filter) inner_time_filter = copy(time_filter)
if inner_from_dttm: if inner_from_dttm:
inner_time_filter[0] = timestamp >= text( inner_time_filter[0] = timestamp >= text(
self.database.dttm_converter(inner_from_dttm)) dttm_col.dttm_sql_literal(inner_from_dttm))
if inner_to_dttm: if inner_to_dttm:
inner_time_filter[1] = timestamp <= text( inner_time_filter[1] = timestamp <= text(
self.database.dttm_converter(inner_to_dttm)) dttm_col.dttm_sql_literal(inner_to_dttm))
else: else:
inner_time_filter = [] inner_time_filter = []
@ -909,6 +901,8 @@ class TableColumn(Model, AuditMixinNullable):
filterable = Column(Boolean, default=False) filterable = Column(Boolean, default=False)
expression = Column(Text, default='') expression = Column(Text, default='')
description = Column(Text, default='') description = Column(Text, default='')
python_date_format = Column(String(255))
database_expression = Column(String(255))
num_types = ('DOUBLE', 'FLOAT', 'INT', 'BIGINT', 'LONG') num_types = ('DOUBLE', 'FLOAT', 'INT', 'BIGINT', 'LONG')
date_types = ('DATE', 'TIME') date_types = ('DATE', 'TIME')
@ -938,6 +932,39 @@ class TableColumn(Model, AuditMixinNullable):
col = literal_column(self.expression).label(name) col = literal_column(self.expression).label(name)
return col return col
def dttm_sql_literal(self, dttm):
    """Convert a datetime object into a SQL literal string.

    If ``database_expression`` is set, it is used as a Python format
    string receiving *dttm* rendered as ``%Y-%m-%d %H:%M:%S``.
    Otherwise ``python_date_format`` drives the conversion:
    ``epoch_s`` / ``epoch_ms`` yield numeric seconds/milliseconds since
    the epoch, and any other pattern (defaulting to an ISO-like one)
    yields a quoted, possibly flavor-specific, timestamp expression.
    """
    if self.database_expression:
        return self.database_expression.format(
            dttm.strftime('%Y-%m-%d %H:%M:%S'))
    tf = self.python_date_format or '%Y-%m-%d %H:%M:%S.%f'
    if tf == 'epoch_s':
        return str((dttm - datetime(1970, 1, 1)).total_seconds())
    if tf == 'epoch_ms':
        return str((dttm - datetime(1970, 1, 1)).total_seconds() * 1000.0)
    default = "'{}'".format(dttm.strftime(tf))
    iso = dttm.isoformat()
    flavor_literals = {
        'mssql': "CONVERT(DATETIME, '{}', 126)".format(iso),  # untested
        'mysql': default,
        'oracle':
            """TO_TIMESTAMP('{}', 'YYYY-MM-DD"T"HH24:MI:SS.ff6')""".format(
                iso),
        'presto': default,
        'sqlite': default,
    }
    uri = self.table.database.sqlalchemy_uri
    for prefix, literal in flavor_literals.items():
        if uri.startswith(prefix):
            return literal
    return default
class DruidCluster(Model, AuditMixinNullable): class DruidCluster(Model, AuditMixinNullable):

View File

@ -187,7 +187,7 @@ class TableColumnInlineView(CompactCRUDMixin, CaravelModelView): # noqa
edit_columns = [ edit_columns = [
'column_name', 'verbose_name', 'description', 'groupby', 'filterable', 'column_name', 'verbose_name', 'description', 'groupby', 'filterable',
'table', 'count_distinct', 'sum', 'min', 'max', 'expression', 'table', 'count_distinct', 'sum', 'min', 'max', 'expression',
'is_dttm', ] 'is_dttm', 'python_date_format', 'database_expression']
add_columns = edit_columns add_columns = edit_columns
list_columns = [ list_columns = [
'column_name', 'type', 'groupby', 'filterable', 'count_distinct', 'column_name', 'type', 'groupby', 'filterable', 'count_distinct',
@ -201,6 +201,24 @@ class TableColumnInlineView(CompactCRUDMixin, CaravelModelView): # noqa
'expression': utils.markdown( 'expression': utils.markdown(
"a valid SQL expression as supported by the underlying backend. " "a valid SQL expression as supported by the underlying backend. "
"Example: `substr(name, 1, 1)`", True), "Example: `substr(name, 1, 1)`", True),
'python_date_format': utils.markdown(Markup(
"The pattern of timestamp format, use "
"<a href='https://docs.python.org/2/library/"
"datetime.html#strftime-strptime-behavior'>"
"python datetime string pattern</a> "
"expression. If time is stored in epoch "
"format, put `epoch_s` or `epoch_ms`. Leave `Database Expression` "
"below empty if timestamp is stored in "
"String or Integer(epoch) type"), True),
'database_expression': utils.markdown(
"The database expression to cast internal datetime "
"constants to database date/timestamp type according to the DBAPI. "
"The expression should follow the pattern of "
"%Y-%m-%d %H:%M:%S, based on different DBAPI. "
"The string should be a python string formatter \n"
"`Ex: TO_DATE('{}', 'YYYY-MM-DD HH24:MI:SS')` for Oracle"
"Caravel uses default expression based on DB URI if this "
"field is blank.", True),
} }
label_columns = { label_columns = {
'column_name': _("Column"), 'column_name': _("Column"),
@ -215,6 +233,8 @@ class TableColumnInlineView(CompactCRUDMixin, CaravelModelView): # noqa
'max': _("Max"), 'max': _("Max"),
'expression': _("Expression"), 'expression': _("Expression"),
'is_dttm': _("Is temporal"), 'is_dttm': _("Is temporal"),
'python_date_format': _("Datetime Format"),
'database_expression': _("Database Expression")
} }
appbuilder.add_view_no_menu(TableColumnInlineView) appbuilder.add_view_no_menu(TableColumnInlineView)
@ -388,7 +408,8 @@ class TableModelView(CaravelModelView, DeleteMixin): # noqa
'table_name', 'database', 'schema', 'table_name', 'database', 'schema',
'default_endpoint', 'offset', 'cache_timeout'] 'default_endpoint', 'offset', 'cache_timeout']
edit_columns = [ edit_columns = [
'table_name', 'is_featured', 'database', 'schema', 'description', 'owner', 'table_name', 'is_featured', 'database', 'schema',
'description', 'owner',
'main_dttm_col', 'default_endpoint', 'offset', 'cache_timeout'] 'main_dttm_col', 'default_endpoint', 'offset', 'cache_timeout']
related_views = [TableColumnInlineView, SqlMetricInlineView] related_views = [TableColumnInlineView, SqlMetricInlineView]
base_order = ('changed_on', 'desc') base_order = ('changed_on', 'desc')

View File

@ -146,15 +146,34 @@ class BaseViz(object):
self.error_msg = "" self.error_msg = ""
self.results = None self.results = None
timestamp_format = None
if self.datasource.type == 'table':
dttm_col = self.datasource.get_col(query_obj['granularity'])
if dttm_col:
timestamp_format = dttm_col.python_date_format
# The datasource here can be different backend but the interface is common # The datasource here can be different backend but the interface is common
self.results = self.datasource.query(**query_obj) self.results = self.datasource.query(**query_obj)
self.query = self.results.query self.query = self.results.query
df = self.results.df df = self.results.df
# Transform the timestamp we received from database to pandas supported
# datetime format. If no python_date_format is specified, the pattern will
# be considered as the default ISO date format
# If the datetime format is unix, the parse will use the corresponding
# parsing logic.
if df is None or df.empty: if df is None or df.empty:
raise Exception("No data, review your incantations!") raise Exception("No data, review your incantations!")
else: else:
if 'timestamp' in df.columns: if 'timestamp' in df.columns:
df.timestamp = pd.to_datetime(df.timestamp, utc=False) if timestamp_format == "epoch_s":
df.timestamp = pd.to_datetime(
df.timestamp, utc=False, unit="s")
elif timestamp_format == "epoch_ms":
df.timestamp = pd.to_datetime(
df.timestamp, utc=False, unit="ms")
else:
df.timestamp = pd.to_datetime(
df.timestamp, utc=False, format=timestamp_format)
if self.datasource.offset: if self.datasource.offset:
df.timestamp += timedelta(hours=self.datasource.offset) df.timestamp += timedelta(hours=self.datasource.offset)
df.replace([np.inf, -np.inf], np.nan) df.replace([np.inf, -np.inf], np.nan)