datetime format and database expression on column level (#652)

* time format minor features added * add description for datetime format input * db version bug workaround * removed unnecessary comments and fixed minor bug * fixed code style * minor fix * fixed missing time format column in DruidDatasource * Update models.py Minor style fix * Revert "Update models.py" This reverts commit 6897c388e0. * removed timestamp_format from druid and removed try catch in migration * Using spaces, not tabs * get the most updated migration and add the migration on the head of it * remove vscode setting file * use column based dttm_format * modify dttm_converter * modify datetime viz * added comments and documents * fixed some description and removed unnecessary import * fix migration head * minor style * minor style * deleted empty lines * delete print statement * add epoch converter * error fixed * fixed epoch parsing issue * delete unnecessary lines * fixed typo * fix minor error * fix styling issues * fix styling error * fixed typo * support epoch_ms and did some refactoring * fixed styling error * fixed styling error * add one more dataset to test dttm_format and db_expr * add more slices * styling * specified String() length
2016-06-27 21:33:44 -07:00 · 2016-06-27 21:33:44 -07:00 · 7a7f61a296
parent 3e742c74bb
commit 7a7f61a296
7 changed files with 203 additions and 28 deletions
--- a/caravel/bin/caravel
+++ b/caravel/bin/caravel
@ -97,6 +97,9 @@ def load_examples(load_test_data):
    print("Loading [Random long/lat data]")
    data.load_long_lat_data()

+    print("Loading [Multiformat time series]")
+    data.load_multiformat_time_series_data()
+
    if load_test_data:
        print("Loading [Unicode test data]")
        data.load_unicode_test_data()
--- a/caravel/data/init.py
+++ b/caravel/data/init.py
@ -12,7 +12,7 @@ import datetime
 import random

 import pandas as pd
-from sqlalchemy import String, DateTime, Date, Float
+from sqlalchemy import String, DateTime, Date, Float, BigInteger

 from caravel import app, db, models, utils

@ -1020,3 +1020,84 @@ def load_long_lat_data():
        params=get_slice_json(slice_data),
    )
    merge_slice(slc)
+
+
+def load_multiformat_time_series_data():
+    """Load the multiformat time series example table and its slices.
+
+    Reads ``multiformat_time_series.json.gz`` from ``DATA_FOLDER`` into the
+    ``multiformat_time_series`` table, stores the per-column datetime
+    settings on the table reference and creates one calendar heatmap slice
+    per table column.
+    """
+    path = os.path.join(DATA_FOLDER, 'multiformat_time_series.json.gz')
+    with gzip.open(path) as f:
+        pdf = pd.read_json(f)
+    pdf.ds = pd.to_datetime(pdf.ds, unit='s')
+    pdf.ds2 = pd.to_datetime(pdf.ds2, unit='s')
+    pdf.to_sql(
+        'multiformat_time_series',
+        db.engine,
+        if_exists='replace',
+        chunksize=500,
+        dtype={
+            "ds": Date,
+            'ds2': DateTime,
+            "epoch_s": BigInteger,
+            "epoch_ms": BigInteger,
+            "string0": String(100),
+            "string1": String(100),
+            "string2": String(100),
+            "string3": String(100),
+        },
+        index=False)
+    print("Done loading table!")
+    print("-" * 80)
+    print("Creating table [multiformat_time_series] reference")
+    obj = db.session.query(TBL).filter_by(table_name='multiformat_time_series').first()
+    if not obj:
+        obj = TBL(table_name='multiformat_time_series')
+    obj.main_dttm_col = 'ds'
+    obj.database = get_or_create_db(db.session)
+    obj.is_featured = False
+    # column name -> [python_date_format, database_expression]
+    dttm_and_expr_dict = {
+        'ds': [None, None],
+        'ds2': [None, None],
+        'epoch_s': ['epoch_s', None],
+        'epoch_ms': ['epoch_ms', None],
+        'string2': ['%Y%m%d-%H%M%S', None],
+        'string1': ['%Y-%m-%d^%H:%M:%S', None],
+        'string0': ['%Y-%m-%d %H:%M:%S.%f', None],
+        'string3': ['%Y/%m/%d%H:%M:%S.%f', None],
+    }
+    # NOTE(review): the columns are configured before fetch_metadata() is
+    # called below -- confirm table_columns is already populated on a first
+    # load, otherwise this loop has nothing to iterate over.
+    for col in obj.table_columns:
+        print(col.column_name)
+        python_date_format, database_expression = (
+            dttm_and_expr_dict[col.column_name])
+        col.python_date_format = python_date_format
+        # The model field is ``database_expression``; the former
+        # ``dbatabase_expr`` assignment only set an unmapped attribute.
+        col.database_expression = database_expression
+    db.session.merge(obj)
+    db.session.commit()
+    obj.fetch_metadata()
+    tbl = obj
+
+    print("Creating some slices")
+    for i, col in enumerate(tbl.table_columns):
+        slice_data = {
+            "granularity_sqla": col.column_name,
+            # NOTE(review): hard-coded datasource id -- verify it matches
+            # the id of the table created above.
+            "datasource_id": "8",
+            "datasource_name": "multiformat_time_series",
+            "datasource_type": "table",
+            "granularity": "day",
+            "row_limit": config.get("ROW_LIMIT"),
+            "since": "1 year ago",
+            "until": "now",
+            "where": "",
+            "viz_type": "cal_heatmap",
+            "domain_granularity": "month",
+            "subdomain_granularity": "day",
+        }
+
+        slc = Slice(
+            slice_name="Calendar Heatmap multiformat" + str(i),
+            viz_type='cal_heatmap',
+            datasource_type='table',
+            table=tbl,
+            params=get_slice_json(slice_data),
+        )
+        merge_slice(slc)
--- a/caravel/data/multiformat_time_series.json.gz
+++ b/caravel/data/multiformat_time_series.json.gz
--- a/caravel/migrations/versions/960c69cb1f5b_.py
+++ b/caravel/migrations/versions/960c69cb1f5b_.py
@ -0,0 +1,24 @@
+"""add dttm_format related fields in table_columns
+
+Revision ID: 960c69cb1f5b
+Revises: d8bc074f7aad
+Create Date: 2016-06-16 14:15:19.573183
+
+"""
+
+# revision identifiers, used by Alembic.
+revision = '960c69cb1f5b'
+down_revision = 'd8bc074f7aad'
+
+from alembic import op
+import sqlalchemy as sa
+
+
+def upgrade():
+    """Add the nullable datetime-format columns to ``table_columns``."""
+    for column_name in ('python_date_format', 'database_expression'):
+        op.add_column(
+            'table_columns',
+            sa.Column(column_name, sa.String(length=255), nullable=True))
+
+
+def downgrade():
+    """Drop the datetime-format columns from ``table_columns``."""
+    for column_name in ('python_date_format', 'database_expression'):
+        op.drop_column('table_columns', column_name)
--- a/caravel/models.py
+++ b/caravel/models.py
@ -445,24 +445,6 @@ class Database(Model, AuditMixinNullable):
            if self.sqlalchemy_uri.startswith(db_type):
                return grains

-    def dttm_converter(self, dttm):
-        """Returns a string that the database flavor understands as a date"""
-        default = "'{}'".format(dttm.strftime('%Y-%m-%d %H:%M:%S.%f'))
-        iso = dttm.isoformat()
-        d = {
-            'mssql': "CONVERT(DATETIME, '{}', 126)".format(iso), #untested
-            'mysql': default,
-            'oracle':
-                """TO_TIMESTAMP('{}', 'YYYY-MM-DD"T"HH24:MI:SS.ff6')""".format(
-                    dttm.isoformat()),
-            'presto': default,
-            'sqlite': default,
-        }
-        for k, v in d.items():
-            if self.sqlalchemy_uri.startswith(k):
-                return v
-        return default
-
    def grains_dict(self):
        return {grain.name: grain for grain in self.grains()}

@ -525,6 +507,7 @@ class SqlaTable(Model, Queryable, AuditMixinNullable):
    offset = Column(Integer, default=0)
    cache_timeout = Column(Integer)
    schema = Column(String(255))
+    table_columns = relationship("TableColumn", back_populates="table")

    baselink = "tablemodelview"

@ -607,6 +590,12 @@ class SqlaTable(Model, Queryable, AuditMixinNullable):
    def sql_link(self):
        return '<a href="{}">SQL</a>'.format(self.sql_url)

+    def get_col(self, col_name):
+        """Return the table column named ``col_name``, or None if absent."""
+        return next(
+            (c for c in self.table_columns if col_name == c.column_name),
+            None)
+
    def query(  # sqla
            self, groupby, metrics,
            granularity,
@ -661,7 +650,8 @@ class SqlaTable(Model, Queryable, AuditMixinNullable):
            metrics_exprs = []

        if granularity:
-            dttm_expr = cols[granularity].sqla_col.label('timestamp')
+            dttm_col = cols[granularity]
+            dttm_expr = dttm_col.sqla_col.label('timestamp')
            timestamp = dttm_expr

            # Transforming time grain into an expression based on configuration
@ -677,18 +667,20 @@ class SqlaTable(Model, Queryable, AuditMixinNullable):
                select_exprs += [timestamp_grain]
                groupby_exprs += [timestamp_grain]

-            tf = '%Y-%m-%d %H:%M:%S.%f'
+            outer_from = text(dttm_col.dttm_sql_literal(from_dttm))
+            outer_to = text(dttm_col.dttm_sql_literal(to_dttm))
+
            time_filter = [
-                timestamp >= text(self.database.dttm_converter(from_dttm)),
-                timestamp <= text(self.database.dttm_converter(to_dttm)),
+                timestamp >= outer_from,
+                timestamp <= outer_to,
            ]
            inner_time_filter = copy(time_filter)
            if inner_from_dttm:
                inner_time_filter[0] = timestamp >= text(
-                    self.database.dttm_converter(inner_from_dttm))
+                    dttm_col.dttm_sql_literal(inner_from_dttm))
            if inner_to_dttm:
                inner_time_filter[1] = timestamp <= text(
-                    self.database.dttm_converter(inner_to_dttm))
+                    dttm_col.dttm_sql_literal(inner_to_dttm))
        else:
            inner_time_filter = []

@ -909,6 +901,8 @@ class TableColumn(Model, AuditMixinNullable):
    filterable = Column(Boolean, default=False)
    expression = Column(Text, default='')
    description = Column(Text, default='')
+    python_date_format = Column(String(255))
+    database_expression = Column(String(255))

    num_types = ('DOUBLE', 'FLOAT', 'INT', 'BIGINT', 'LONG')
    date_types = ('DATE', 'TIME')
@ -938,6 +932,39 @@ class TableColumn(Model, AuditMixinNullable):
            col = literal_column(self.expression).label(name)
        return col

+    def dttm_sql_literal(self, dttm):
+        """Render a datetime as a SQL literal suited to this column.
+
+        Resolution order:
+
+        1. ``database_expression``, when set, is used as a ``str.format``
+           template and receives ``dttm`` formatted as
+           ``%Y-%m-%d %H:%M:%S``.
+        2. A ``python_date_format`` of ``epoch_s`` / ``epoch_ms`` yields the
+           seconds / milliseconds elapsed since 1970-01-01 as a string.
+        3. Any other ``python_date_format`` yields ``dttm`` formatted with
+           that pattern and wrapped in single quotes.
+        4. With neither field set, a default literal is picked from the
+           prefix of the table's database ``sqlalchemy_uri``.
+
+        :param dttm: the ``datetime`` to convert
+        :return: a string meant to be embedded in a SQL statement
+        """
+        if self.database_expression:
+            return self.database_expression.format(
+                dttm.strftime('%Y-%m-%d %H:%M:%S'))
+
+        tf = self.python_date_format
+        if tf == 'epoch_s':
+            return str((dttm - datetime(1970, 1, 1)).total_seconds())
+        if tf == 'epoch_ms':
+            return str(
+                (dttm - datetime(1970, 1, 1)).total_seconds() * 1000.0)
+        if tf:
+            # An explicit pattern describes how the column stores its
+            # values, so it wins over the URI based defaults; mssql and
+            # oracle used to ignore it while the other backends honoured it.
+            return "'{}'".format(dttm.strftime(tf))
+
+        iso = dttm.isoformat()
+        uri_defaults = {
+            'mssql': "CONVERT(DATETIME, '{}', 126)".format(iso),  # untested
+            'oracle':
+                """TO_TIMESTAMP('{}', 'YYYY-MM-DD"T"HH24:MI:SS.ff6')""".format(
+                    iso),
+        }
+        uri = self.table.database.sqlalchemy_uri
+        for prefix, literal in uri_defaults.items():
+            if uri.startswith(prefix):
+                return literal
+        # mysql, presto, sqlite and any unknown backend share this literal.
+        return "'{}'".format(dttm.strftime('%Y-%m-%d %H:%M:%S.%f'))
+

 class DruidCluster(Model, AuditMixinNullable):

--- a/caravel/views.py
+++ b/caravel/views.py
@ -187,7 +187,7 @@ class TableColumnInlineView(CompactCRUDMixin, CaravelModelView):  # noqa
    edit_columns = [
        'column_name', 'verbose_name', 'description', 'groupby', 'filterable',
        'table', 'count_distinct', 'sum', 'min', 'max', 'expression',
-        'is_dttm', ]
+        'is_dttm', 'python_date_format', 'database_expression']
    add_columns = edit_columns
    list_columns = [
        'column_name', 'type', 'groupby', 'filterable', 'count_distinct',
@ -201,6 +201,24 @@ class TableColumnInlineView(CompactCRUDMixin, CaravelModelView):  # noqa
        'expression': utils.markdown(
            "a valid SQL expression as supported by the underlying backend. "
            "Example: `substr(name, 1, 1)`", True),
+        'python_date_format': utils.markdown(Markup(
+            "The pattern of timestamp format, use "
+            "<a href='https://docs.python.org/2/library/"
+            "datetime.html#strftime-strptime-behavior'>"
+            "python datetime string pattern</a> "
+            "expression. If time is stored in epoch "
+            "format, put `epoch_s` or `epoch_ms`. Leave `Database Expression` "
+            "below empty if timestamp is stored in "
+            "String or Integer(epoch) type"), True),
+        'database_expression': utils.markdown(
+            "The database expression to cast internal datetime "
+            "constants to database date/timestamp type according to the DBAPI. "
+            "The expression should follow the pattern of "
+            "%Y-%m-%d %H:%M:%S, based on different DBAPI. "
+            "The string should be a python string formatter \n"
+            "`Ex: TO_DATE('{}', 'YYYY-MM-DD HH24:MI:SS')` for Oracle"
+            "Caravel uses default expression based on DB URI if this "
+            "field is blank.", True),
    }
    label_columns = {
        'column_name': _("Column"),
@ -215,6 +233,8 @@ class TableColumnInlineView(CompactCRUDMixin, CaravelModelView):  # noqa
        'max': _("Max"),
        'expression': _("Expression"),
        'is_dttm': _("Is temporal"),
+        'python_date_format': _("Datetime Format"),
+        'database_expression': _("Database Expression")
    }
 appbuilder.add_view_no_menu(TableColumnInlineView)

@ -388,7 +408,8 @@ class TableModelView(CaravelModelView, DeleteMixin):  # noqa
        'table_name', 'database', 'schema',
        'default_endpoint', 'offset', 'cache_timeout']
    edit_columns = [
-        'table_name', 'is_featured', 'database', 'schema', 'description', 'owner',
+        'table_name', 'is_featured', 'database', 'schema',
+        'description', 'owner',
        'main_dttm_col', 'default_endpoint', 'offset', 'cache_timeout']
    related_views = [TableColumnInlineView, SqlMetricInlineView]
    base_order = ('changed_on', 'desc')
--- a/caravel/viz.py
+++ b/caravel/viz.py
@ -146,15 +146,34 @@ class BaseViz(object):
        self.error_msg = ""
        self.results = None

+        timestamp_format = None
+        if self.datasource.type == 'table':
+            dttm_col = self.datasource.get_col(query_obj['granularity'])
+            if dttm_col:
+                timestamp_format = dttm_col.python_date_format
+
        # The datasource here can be different backend but the interface is common
        self.results = self.datasource.query(**query_obj)
        self.query = self.results.query
        df = self.results.df
+        # Convert the timestamp column returned by the database into pandas
+        # datetimes. Epoch columns (`epoch_s` / `epoch_ms`) are parsed with
+        # the matching unit; any other column is parsed with its
+        # python_date_format, and pandas infers the format when none is
+        # specified.
        if df is None or df.empty:
            raise Exception("No data, review your incantations!")
        else:
            if 'timestamp' in df.columns:
-                df.timestamp = pd.to_datetime(df.timestamp, utc=False)
+                if timestamp_format == "epoch_s":
+                    df.timestamp = pd.to_datetime(
+                        df.timestamp, utc=False, unit="s")
+                elif timestamp_format == "epoch_ms":
+                    df.timestamp = pd.to_datetime(
+                        df.timestamp, utc=False, unit="ms")
+                else:
+                    df.timestamp = pd.to_datetime(
+                        df.timestamp, utc=False, format=timestamp_format)
                if self.datasource.offset:
                    df.timestamp += timedelta(hours=self.datasource.offset)
        df.replace([np.inf, -np.inf], np.nan)