superset/caravel/dataframe.py

""" Caravel wrapper around pandas.DataFrame.

TODO(bkyryliuk): add support for naming conventions such as *_dim or dim_*
                 (dimensions) and *_ts, ts_*, ds_*, *_ds (datetimes), etc.
TODO(bkyryliuk): recognize integer-encoded enums.

"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import pandas as pd
import numpy as np


# Percentage (0-100) of sampled values that must be exceeded by the datetime
# conversion rate for an 'object' column to be reported as 'datetime_string'.
INFER_COL_TYPES_THRESHOLD = 95
# Upper bound on the number of rows sampled when inferring column types.
INFER_COL_TYPES_SAMPLE_SIZE = 100


# http://pandas.pydata.org/pandas-docs/stable/internals.html#
# subclassing-pandas-data-structures
class CaravelDataFrame(object):
    """Wrap a ``pandas.DataFrame`` and expose its rows and column metadata.

    The wrapped frame is a copy of the input in which null cells are
    replaced with ``None`` (via ``DataFrame.where``).
    """

    def __init__(self, df):
        """Store ``df`` with its null cells swapped for ``None``.

        :param df: the ``pandas.DataFrame`` to wrap.
        """
        self.__df = df.where((pd.notnull(df)), None)

    @property
    def size(self):
        """Number of rows in the wrapped frame."""
        return len(self.__df.index)

    @property
    def data(self):
        """Rows of the wrapped frame as a list of ``{column: value}`` dicts."""
        return self.__df.to_dict(orient='records')

    @property
    def columns_dict(self):
        """Provides metadata about columns for data visualization.

        :return: ``None`` when the frame is empty; otherwise a list with one
            dict per column holding the keys ``name``, ``type``, ``is_date``
            and ``is_dim``, plus ``agg`` when an aggregation applies.
        """
        if self.__df.empty:
            return None

        columns = []
        # Only a bounded random sample is inspected for datetime inference.
        sample_size = min(INFER_COL_TYPES_SAMPLE_SIZE, len(self.__df.index))
        sample = self.__df
        if sample_size:
            sample = self.__df.sample(sample_size)
        for col in self.__df.dtypes.keys():
            dtype = self.__df.dtypes[col]
            column = {
                'name': col,
                'type': dtype.name,
                'is_date': is_date(dtype),
                'is_dim': is_dimension(dtype, col),
            }
            # 'agg' is an optional attribute: set it only when an aggregation
            # was actually suggested (test the result, not the function).
            agg = agg_func(dtype, col)
            if agg:
                column['agg'] = agg

            if column['type'] == 'object':
                # check if encoded datetime
                if (datetime_conversion_rate(sample[col]) >
                        INFER_COL_TYPES_THRESHOLD):
                    column.update({
                        'type': 'datetime_string',
                        'is_date': True,
                        'is_dim': False,
                    })
                    # Datetime strings are never aggregated.
                    column.pop('agg', None)
            columns.append(column)

        return columns


def datetime_conversion_rate(data_series):
    """Return the percentage of values that ``pd.to_datetime`` accepts.

    It will give false positives on numbers that are stored as strings: it
    is hard to distinguish integer numbers from timestamps.

    NOTE(review): null values appear to convert without raising and are
    therefore counted as successes -- confirm that this is intended.

    :param data_series: iterable of values to probe (e.g. a pandas Series).
    :return: a number between 0 and 100; 0 when ``data_series`` is empty.
    """
    success = 0
    total = 0
    for value in data_series:
        total += 1
        try:
            pd.to_datetime(value)
        except Exception:
            # Best effort: any failure just means "not a datetime".
            continue
        success += 1
    if not total:
        # Nothing to inspect; avoid dividing by zero.
        return 0
    return 100 * success / total


def is_date(dtype):
    """Tell whether the name of ``dtype`` begins with ``'datetime'``."""
    type_name = dtype.name
    return type_name.startswith('datetime')


def is_dimension(dtype, column_name):
    """Tell whether a column should be treated as a dimension.

    :param dtype: dtype of the column.
    :param column_name: name of the column.
    :return: False for id-like column names; otherwise True only for
        object and boolean dtypes.
    """
    if is_id(column_name):
        return False
    # Compare against the builtins: the ``np.object`` / ``np.bool`` aliases
    # for them were deprecated and later removed from NumPy.
    return dtype == object or dtype == bool


def is_id(column_name):
    """Tell whether ``column_name`` starts or ends with ``'id'``."""
    if column_name.startswith('id'):
        return True
    return column_name.endswith('id')


def agg_func(dtype, column_name):
    """Suggest a default aggregation for a column.

    :param dtype: dtype of the column.
    :param column_name: name of the column.
    :return: ``'count_distinct'`` for id-like column names, ``'sum'`` for
        numeric dtypes, otherwise ``None``.
    """
    # consider checking for key substring too.
    if is_id(column_name):
        return 'count_distinct'
    try:
        numeric = np.issubdtype(dtype, np.number)
    except TypeError:
        # dtypes NumPy cannot interpret (e.g. pandas extension dtypes) are
        # treated as non-numeric rather than aborting the whole inference.
        numeric = False
    if numeric:
        return 'sum'
    return None