datetime format and database expression on column level (#652)

* time format minor features added

* add description for datetime format input

* db version bug workaround

* removed unnecessary comments and fixed minor bug

* fixed code style

* minor fix

* fixed missing time format column in DruidDatasource

* Update models.py

Minor style fix

* Revert "Update models.py"

This reverts commit 6897c388e0.

* removed timestamp_format from druid and removed try catch in migration

* Using spaces, not tabs

* get the most updated migration and add the migration on the head of it

* remove vscode setting file

* use column based dttm_format

* modify dttm_converter

* modify datetime viz

* added comments and documents

* fixed some description and removed unnecessary import

* fix migration head

* minor style

* minor style

* deleted empty lines

* delete print statement

* add epoch converter

* error fixed

* fixed epoch parsing issue

* delete unnecessary lines

* fixed typo

* fix minor error

* fix styling issues

* fix styling error

* fixed typo

* support epoch_ms and did some refactoring

* fixed styling error

* fixed styling error

* add one more dataset to test dttm_format and db_expr

* add more slices

* styling

* specified String() length
This commit is contained in:
yxjames 2016-06-27 21:33:44 -07:00 committed by Maxime Beauchemin
parent 3e742c74bb
commit 7a7f61a296
7 changed files with 203 additions and 28 deletions

View File

@ -97,6 +97,9 @@ def load_examples(load_test_data):
print("Loading [Random long/lat data]") print("Loading [Random long/lat data]")
data.load_long_lat_data() data.load_long_lat_data()
print("Loading [Multiformat time series]")
data.load_multiformat_time_series_data()
if load_test_data: if load_test_data:
print("Loading [Unicode test data]") print("Loading [Unicode test data]")
data.load_unicode_test_data() data.load_unicode_test_data()

View File

@ -12,7 +12,7 @@ import datetime
import random import random
import pandas as pd import pandas as pd
from sqlalchemy import String, DateTime, Date, Float from sqlalchemy import String, DateTime, Date, Float, BigInteger
from caravel import app, db, models, utils from caravel import app, db, models, utils
@ -1020,3 +1020,84 @@ def load_long_lat_data():
params=get_slice_json(slice_data), params=get_slice_json(slice_data),
) )
merge_slice(slc) merge_slice(slc)
def load_multiformat_time_series_data():
    """Load the multi-format time series example table and its slices.

    The fixture is a gzipped JSON file shipped in the repo; each row
    carries the same instant encoded several ways (date, datetime, epoch
    seconds/milliseconds and assorted string patterns) so that the
    per-column ``python_date_format`` / ``database_expression`` handling
    can be exercised end to end.
    """
    with gzip.open(os.path.join(
            DATA_FOLDER, 'multiformat_time_series.json.gz')) as f:
        pdf = pd.read_json(f)
    pdf.ds = pd.to_datetime(pdf.ds, unit='s')
    pdf.ds2 = pd.to_datetime(pdf.ds2, unit='s')
    pdf.to_sql(
        'multiformat_time_series',
        db.engine,
        if_exists='replace',
        chunksize=500,
        dtype={
            "ds": Date,
            'ds2': DateTime,
            "epoch_s": BigInteger,
            "epoch_ms": BigInteger,
            "string0": String(100),
            "string1": String(100),
            "string2": String(100),
            "string3": String(100),
        },
        index=False)
    print("Done loading table!")
    print("-" * 80)
    print("Creating table [multiformat_time_series] reference")
    obj = db.session.query(TBL).filter_by(
        table_name='multiformat_time_series').first()
    if not obj:
        obj = TBL(table_name='multiformat_time_series')
    obj.main_dttm_col = 'ds'
    obj.database = get_or_create_db(db.session)
    obj.is_featured = False
    # column_name -> [python_date_format, database_expression]
    dttm_and_expr_dict = {
        'ds': [None, None],
        'ds2': [None, None],
        'epoch_s': ['epoch_s', None],
        'epoch_ms': ['epoch_ms', None],
        'string2': ['%Y%m%d-%H%M%S', None],
        'string1': ['%Y-%m-%d^%H:%M:%S', None],
        'string0': ['%Y-%m-%d %H:%M:%S.%f', None],
        'string3': ['%Y/%m/%d%H:%M:%S.%f', None],
    }
    for col in obj.table_columns:
        dttm_and_expr = dttm_and_expr_dict[col.column_name]
        col.python_date_format = dttm_and_expr[0]
        # BUG FIX: the model attribute is ``database_expression`` (see the
        # TableColumn model and the 960c69cb1f5b migration); the previous
        # ``dbatabase_expr`` typo set a throwaway attribute, so the
        # expression was never persisted. Also dropped a leftover debug
        # print of the column name.
        col.database_expression = dttm_and_expr[1]
    db.session.merge(obj)
    db.session.commit()
    obj.fetch_metadata()
    tbl = obj
    print("Creating some slices")
    for i, col in enumerate(tbl.table_columns):
        slice_data = {
            "granularity_sqla": col.column_name,
            "datasource_id": "8",
            "datasource_name": "multiformat_time_series",
            "datasource_type": "table",
            "granularity": "day",
            "row_limit": config.get("ROW_LIMIT"),
            "since": "1 year ago",
            "until": "now",
            "where": "",
            "viz_type": "cal_heatmap",
            "domain_granularity": "month",
            "subdomain_granularity": "day",
        }
        slc = Slice(
            slice_name="Calendar Heatmap multiformat" + str(i),
            viz_type='cal_heatmap',
            datasource_type='table',
            table=tbl,
            params=get_slice_json(slice_data),
        )
        merge_slice(slc)

Binary file not shown.

View File

@ -0,0 +1,24 @@
"""add dttm_format related fields in table_columns

Revision ID: 960c69cb1f5b
Revises: d8bc074f7aad
Create Date: 2016-06-16 14:15:19.573183

"""

# revision identifiers, used by Alembic.
revision = '960c69cb1f5b'
down_revision = 'd8bc074f7aad'

from alembic import op
import sqlalchemy as sa

# Both columns are nullable, so existing rows need no backfill.
_NEW_COLUMNS = ('python_date_format', 'database_expression')


def upgrade():
    """Add per-column datetime handling fields to ``table_columns``."""
    for name in _NEW_COLUMNS:
        op.add_column(
            'table_columns',
            sa.Column(name, sa.String(length=255), nullable=True))


def downgrade():
    """Drop the fields added by :func:`upgrade`."""
    for name in _NEW_COLUMNS:
        op.drop_column('table_columns', name)

View File

@ -445,24 +445,6 @@ class Database(Model, AuditMixinNullable):
if self.sqlalchemy_uri.startswith(db_type): if self.sqlalchemy_uri.startswith(db_type):
return grains return grains
def dttm_converter(self, dttm):
    """Render *dttm* as a literal this database flavor parses as a date."""
    # Generic quoted timestamp; fine for most ANSI-ish backends.
    default = "'{}'".format(dttm.strftime('%Y-%m-%d %H:%M:%S.%f'))
    iso = dttm.isoformat()
    per_flavor = {
        'mssql': "CONVERT(DATETIME, '{}', 126)".format(iso),  # untested
        'mysql': default,
        'oracle':
            """TO_TIMESTAMP('{}', 'YYYY-MM-DD"T"HH24:MI:SS.ff6')""".format(
                iso),
        'presto': default,
        'sqlite': default,
    }
    # URI prefixes are mutually exclusive, so first match is the only match.
    uri = self.sqlalchemy_uri
    return next(
        (literal for prefix, literal in per_flavor.items()
         if uri.startswith(prefix)),
        default)
def grains_dict(self): def grains_dict(self):
return {grain.name: grain for grain in self.grains()} return {grain.name: grain for grain in self.grains()}
@ -525,6 +507,7 @@ class SqlaTable(Model, Queryable, AuditMixinNullable):
offset = Column(Integer, default=0) offset = Column(Integer, default=0)
cache_timeout = Column(Integer) cache_timeout = Column(Integer)
schema = Column(String(255)) schema = Column(String(255))
table_columns = relationship("TableColumn", back_populates="table")
baselink = "tablemodelview" baselink = "tablemodelview"
@ -607,6 +590,12 @@ class SqlaTable(Model, Queryable, AuditMixinNullable):
def sql_link(self): def sql_link(self):
return '<a href="{}">SQL</a>'.format(self.sql_url) return '<a href="{}">SQL</a>'.format(self.sql_url)
def get_col(self, col_name):
    """Return this table's column named *col_name*, or None if absent."""
    return next(
        (candidate for candidate in self.table_columns
         if candidate.column_name == col_name),
        None)
def query( # sqla def query( # sqla
self, groupby, metrics, self, groupby, metrics,
granularity, granularity,
@ -661,7 +650,8 @@ class SqlaTable(Model, Queryable, AuditMixinNullable):
metrics_exprs = [] metrics_exprs = []
if granularity: if granularity:
dttm_expr = cols[granularity].sqla_col.label('timestamp') dttm_col = cols[granularity]
dttm_expr = dttm_col.sqla_col.label('timestamp')
timestamp = dttm_expr timestamp = dttm_expr
# Transforming time grain into an expression based on configuration # Transforming time grain into an expression based on configuration
@ -677,18 +667,20 @@ class SqlaTable(Model, Queryable, AuditMixinNullable):
select_exprs += [timestamp_grain] select_exprs += [timestamp_grain]
groupby_exprs += [timestamp_grain] groupby_exprs += [timestamp_grain]
tf = '%Y-%m-%d %H:%M:%S.%f' outer_from = text(dttm_col.dttm_sql_literal(from_dttm))
outer_to = text(dttm_col.dttm_sql_literal(to_dttm))
time_filter = [ time_filter = [
timestamp >= text(self.database.dttm_converter(from_dttm)), timestamp >= outer_from,
timestamp <= text(self.database.dttm_converter(to_dttm)), timestamp <= outer_to,
] ]
inner_time_filter = copy(time_filter) inner_time_filter = copy(time_filter)
if inner_from_dttm: if inner_from_dttm:
inner_time_filter[0] = timestamp >= text( inner_time_filter[0] = timestamp >= text(
self.database.dttm_converter(inner_from_dttm)) dttm_col.dttm_sql_literal(inner_from_dttm))
if inner_to_dttm: if inner_to_dttm:
inner_time_filter[1] = timestamp <= text( inner_time_filter[1] = timestamp <= text(
self.database.dttm_converter(inner_to_dttm)) dttm_col.dttm_sql_literal(inner_to_dttm))
else: else:
inner_time_filter = [] inner_time_filter = []
@ -909,6 +901,8 @@ class TableColumn(Model, AuditMixinNullable):
filterable = Column(Boolean, default=False) filterable = Column(Boolean, default=False)
expression = Column(Text, default='') expression = Column(Text, default='')
description = Column(Text, default='') description = Column(Text, default='')
python_date_format = Column(String(255))
database_expression = Column(String(255))
num_types = ('DOUBLE', 'FLOAT', 'INT', 'BIGINT', 'LONG') num_types = ('DOUBLE', 'FLOAT', 'INT', 'BIGINT', 'LONG')
date_types = ('DATE', 'TIME') date_types = ('DATE', 'TIME')
@ -938,6 +932,39 @@ class TableColumn(Model, AuditMixinNullable):
col = literal_column(self.expression).label(name) col = literal_column(self.expression).label(name)
return col return col
def dttm_sql_literal(self, dttm):
    """Convert a datetime object into a SQL literal string.

    If ``database_expression`` is set, it is used as a Python format
    string receiving *dttm* rendered as ``%Y-%m-%d %H:%M:%S``.
    Otherwise ``python_date_format`` drives the conversion:
    ``epoch_s`` / ``epoch_ms`` yield numeric seconds/milliseconds since
    the epoch, and any other pattern (defaulting to an ISO-like one)
    yields a quoted, possibly flavor-specific, timestamp expression.
    """
    if self.database_expression:
        return self.database_expression.format(
            dttm.strftime('%Y-%m-%d %H:%M:%S'))
    tf = self.python_date_format or '%Y-%m-%d %H:%M:%S.%f'
    if tf == 'epoch_s':
        return str((dttm - datetime(1970, 1, 1)).total_seconds())
    if tf == 'epoch_ms':
        return str((dttm - datetime(1970, 1, 1)).total_seconds() * 1000.0)
    default = "'{}'".format(dttm.strftime(tf))
    iso = dttm.isoformat()
    flavor_literals = {
        'mssql': "CONVERT(DATETIME, '{}', 126)".format(iso),  # untested
        'mysql': default,
        'oracle':
            """TO_TIMESTAMP('{}', 'YYYY-MM-DD"T"HH24:MI:SS.ff6')""".format(
                iso),
        'presto': default,
        'sqlite': default,
    }
    uri = self.table.database.sqlalchemy_uri
    for prefix, literal in flavor_literals.items():
        if uri.startswith(prefix):
            return literal
    return default
class DruidCluster(Model, AuditMixinNullable): class DruidCluster(Model, AuditMixinNullable):

View File

@ -187,7 +187,7 @@ class TableColumnInlineView(CompactCRUDMixin, CaravelModelView): # noqa
edit_columns = [ edit_columns = [
'column_name', 'verbose_name', 'description', 'groupby', 'filterable', 'column_name', 'verbose_name', 'description', 'groupby', 'filterable',
'table', 'count_distinct', 'sum', 'min', 'max', 'expression', 'table', 'count_distinct', 'sum', 'min', 'max', 'expression',
'is_dttm', ] 'is_dttm', 'python_date_format', 'database_expression']
add_columns = edit_columns add_columns = edit_columns
list_columns = [ list_columns = [
'column_name', 'type', 'groupby', 'filterable', 'count_distinct', 'column_name', 'type', 'groupby', 'filterable', 'count_distinct',
@ -201,6 +201,24 @@ class TableColumnInlineView(CompactCRUDMixin, CaravelModelView): # noqa
'expression': utils.markdown( 'expression': utils.markdown(
"a valid SQL expression as supported by the underlying backend. " "a valid SQL expression as supported by the underlying backend. "
"Example: `substr(name, 1, 1)`", True), "Example: `substr(name, 1, 1)`", True),
'python_date_format': utils.markdown(Markup(
"The pattern of timestamp format, use "
"<a href='https://docs.python.org/2/library/"
"datetime.html#strftime-strptime-behavior'>"
"python datetime string pattern</a> "
"expression. If time is stored in epoch "
"format, put `epoch_s` or `epoch_ms`. Leave `Database Expression` "
"below empty if timestamp is stored in "
"String or Integer(epoch) type"), True),
'database_expression': utils.markdown(
"The database expression to cast internal datetime "
"constants to database date/timestamp type according to the DBAPI. "
"The expression should follow the pattern of "
"%Y-%m-%d %H:%M:%S, based on different DBAPI. "
"The string should be a python string formatter \n"
"`Ex: TO_DATE('{}', 'YYYY-MM-DD HH24:MI:SS')` for Oracle"
"Caravel uses default expression based on DB URI if this "
"field is blank.", True),
} }
label_columns = { label_columns = {
'column_name': _("Column"), 'column_name': _("Column"),
@ -215,6 +233,8 @@ class TableColumnInlineView(CompactCRUDMixin, CaravelModelView): # noqa
'max': _("Max"), 'max': _("Max"),
'expression': _("Expression"), 'expression': _("Expression"),
'is_dttm': _("Is temporal"), 'is_dttm': _("Is temporal"),
'python_date_format': _("Datetime Format"),
'database_expression': _("Database Expression")
} }
appbuilder.add_view_no_menu(TableColumnInlineView) appbuilder.add_view_no_menu(TableColumnInlineView)
@ -388,7 +408,8 @@ class TableModelView(CaravelModelView, DeleteMixin): # noqa
'table_name', 'database', 'schema', 'table_name', 'database', 'schema',
'default_endpoint', 'offset', 'cache_timeout'] 'default_endpoint', 'offset', 'cache_timeout']
edit_columns = [ edit_columns = [
'table_name', 'is_featured', 'database', 'schema', 'description', 'owner', 'table_name', 'is_featured', 'database', 'schema',
'description', 'owner',
'main_dttm_col', 'default_endpoint', 'offset', 'cache_timeout'] 'main_dttm_col', 'default_endpoint', 'offset', 'cache_timeout']
related_views = [TableColumnInlineView, SqlMetricInlineView] related_views = [TableColumnInlineView, SqlMetricInlineView]
base_order = ('changed_on', 'desc') base_order = ('changed_on', 'desc')

View File

@ -146,15 +146,34 @@ class BaseViz(object):
self.error_msg = "" self.error_msg = ""
self.results = None self.results = None
timestamp_format = None
if self.datasource.type == 'table':
dttm_col = self.datasource.get_col(query_obj['granularity'])
if dttm_col:
timestamp_format = dttm_col.python_date_format
# The datasource here can be different backend but the interface is common # The datasource here can be different backend but the interface is common
self.results = self.datasource.query(**query_obj) self.results = self.datasource.query(**query_obj)
self.query = self.results.query self.query = self.results.query
df = self.results.df df = self.results.df
# Transform the timestamp we received from database to pandas supported
# datetime format. If no python_date_format is specified, the pattern will
# be considered as the default ISO date format
# If the datetime format is unix, the parse will use the corresponding
# parsing logic.
if df is None or df.empty: if df is None or df.empty:
raise Exception("No data, review your incantations!") raise Exception("No data, review your incantations!")
else: else:
if 'timestamp' in df.columns: if 'timestamp' in df.columns:
df.timestamp = pd.to_datetime(df.timestamp, utc=False) if timestamp_format == "epoch_s":
df.timestamp = pd.to_datetime(
df.timestamp, utc=False, unit="s")
elif timestamp_format == "epoch_ms":
df.timestamp = pd.to_datetime(
df.timestamp, utc=False, unit="ms")
else:
df.timestamp = pd.to_datetime(
df.timestamp, utc=False, format=timestamp_format)
if self.datasource.offset: if self.datasource.offset:
df.timestamp += timedelta(hours=self.datasource.offset) df.timestamp += timedelta(hours=self.datasource.offset)
df.replace([np.inf, -np.inf], np.nan) df.replace([np.inf, -np.inf], np.nan)