Make thrift, pyhive and tableschema optional via extras_require (#6696)

* Make thrift, pyhive and tableschema optional via extras_require

Looking at the dependency tree for license-related questions, I noticed
that tableschema pulls in a huge subtree that only people running Hive
really need. This makes it, along with pyhive and thrift, optional.

Also bumps some Python dependencies.

* Run pip-compile

* Removing refs to past.builtins (from future lib)

* Add thrift
Maxime Beauchemin, 2019-01-19 14:27:18 -08:00 (committed by GitHub)
parent ebb799140a, commit f742b9876b
12 changed files with 39 additions and 64 deletions


@@ -3,6 +3,12 @@
 This file documents any backwards-incompatible changes in Superset and
 assists people when migrating to a new version.
 
+## Superset 0.32.0
+* If you use `Hive` or `Presto`, we've moved some dependencies that were
+  in the main package as optional now. To get these packages,
+  run `pip install superset[presto]` and/or `pip install superset[hive]` as
+  required.
+
 ## Superset 0.31.0
 * boto3 / botocore was removed from the dependency list. If you use s3
 as a place to store your SQL Lab result set or Hive uploads, you may
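
A quick way to check whether an environment already has the packages behind the new optional extras is to probe for them at runtime. This is an illustrative sketch only (the helper name and module list are assumptions mirroring the `hive` extra defined further down, not code from this commit):

import importlib.util

# Modules provided by `pip install superset[hive]`
# (the thrift-sasl pip package imports as thrift_sasl).
HIVE_EXTRA_MODULES = ['pyhive', 'tableschema', 'thrift', 'thrift_sasl']

def missing_hive_modules():
    """Return the modules from the hive extra that cannot be imported."""
    return [name for name in HIVE_EXTRA_MODULES
            if importlib.util.find_spec(name) is None]

if __name__ == '__main__':
    missing = missing_hive_modules()
    if missing:
        print('Hive support unavailable, missing: ' + ', '.join(missing))
        print('Install with: pip install superset[hive]')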


@@ -9,8 +9,10 @@ mysqlclient==1.3.13
 pip-tools==3.1.0
 psycopg2-binary==2.7.5
 pycodestyle==2.4.0
+pyhive==0.6.1
 pylint==1.9.2
 python-dotenv==0.10.1
 redis==2.10.6
 statsd==3.3.0
+thrift==0.11.0
 tox==3.5.3


@@ -7,11 +7,9 @@
 alembic==1.0.0  # via flask-migrate
 amqp==2.3.2  # via kombu
 asn1crypto==0.24.0  # via cryptography
-babel==2.6.0  # via flask-babel, flower
+babel==2.6.0  # via flask-babel
 billiard==3.5.0.4  # via celery
 bleach==3.0.2
-cachetools==3.0.0  # via google-auth
-cchardet==1.0.0  # via tabulator
 celery==4.2.0
 certifi==2018.8.24  # via requests
 cffi==1.11.5  # via cryptography
@@ -23,7 +21,6 @@ croniter==0.3.26
 cryptography==2.4.2
 decorator==4.3.0  # via retry
 defusedxml==0.5.0  # via python3-openid
-et-xmlfile==1.0.1  # via openpyxl
 flask-appbuilder==1.12.1
 flask-babel==0.11.1  # via flask-appbuilder
 flask-caching==1.4.0
@@ -34,67 +31,42 @@ flask-openid==1.2.5  # via flask-appbuilder
 flask-sqlalchemy==2.3.2  # via flask-appbuilder, flask-migrate
 flask-wtf==0.14.2
 flask==1.0.2
-flower==0.9.2
-future==0.16.0  # via pyhive
 geopy==1.11.0
-google-auth==1.6.1  # via gsheetsdb
-gsheetsdb==0.1.9
 gunicorn==19.8.0
 humanize==0.5.1
 idna==2.6
-ijson==2.3  # via tabulator
 isodate==0.6.0
 itsdangerous==0.24  # via flask
-jdcal==1.4  # via openpyxl
 jinja2==2.10  # via flask, flask-babel
-jsonlines==1.2.0  # via tabulator
-jsonschema==2.6.0  # via tableschema
 kombu==4.2.1  # via celery
-linear-tsv==1.1.0  # via tabulator
 mako==1.0.7  # via alembic
 markdown==3.0
 markupsafe==1.0  # via jinja2, mako
-mo-future==2.20.18317  # via moz-sql-parser
-moz-sql-parser==2.19.18318  # via gsheetsdb
 numpy==1.15.2  # via pandas
-openpyxl==2.4.11  # via tabulator
 pandas==0.23.1
 parsedatetime==2.0.0
 pathlib2==2.3.0
 polyline==1.3.2
 py==1.7.0  # via retry
-pyasn1-modules==0.2.2  # via google-auth
-pyasn1==0.4.4  # via pyasn1-modules, rsa
 pycparser==2.19  # via cffi
 pydruid==0.5.0
-pyhive==0.5.1
-pyparsing==2.3.0  # via moz-sql-parser
 python-dateutil==2.6.1
 python-editor==1.0.3  # via alembic
 python-geohash==0.8.5
 python3-openid==3.1.0  # via flask-openid
-pytz==2018.5  # via babel, celery, flower, pandas
+pytz==2018.5  # via babel, celery, pandas
 pyyaml==3.13
 requests==2.20.0
 retry==0.9.2
-rfc3986==1.1.0  # via tableschema
-rsa==4.0  # via google-auth
-sasl==0.2.1  # via thrift-sasl
 selenium==3.141.0
 simplejson==3.15.0
-six==1.11.0  # via bleach, cryptography, google-auth, gsheetsdb, isodate, jsonlines, linear-tsv, pathlib2, polyline, pydruid, python-dateutil, sasl, sqlalchemy-utils, tableschema, tabulator, thrift
+six==1.11.0  # via bleach, cryptography, isodate, pathlib2, polyline, pydruid, python-dateutil, sqlalchemy-utils
 sqlalchemy-utils==0.32.21
 sqlalchemy==1.2.2
 sqlparse==0.2.4
-tableschema==1.1.0
-tabulator==1.15.0  # via tableschema
-thrift-sasl==0.3.0
-thrift==0.11.0
-tornado==5.1.1  # via flower
 unicodecsv==0.14.1
 urllib3==1.22  # via requests, selenium
 vine==1.1.4  # via amqp
 webencodings==0.5.1  # via bleach
 werkzeug==0.14.1  # via flask
 wtforms==2.2.1  # via flask-wtf
-xlrd==1.1.0  # via tabulator


@@ -82,9 +82,7 @@ setup(
         'flask-compress',
         'flask-migrate',
         'flask-wtf',
-        'flower',  # deprecated
         'geopy',
-        'gsheetsdb>=0.1.9',
         'gunicorn',  # deprecated
         'humanize',
         'idna',
@@ -95,7 +93,6 @@ setup(
         'pathlib2',
         'polyline',
         'pydruid>=0.4.3',
-        'pyhive>=0.4.0',
         'python-dateutil',
         'python-geohash',
         'pyyaml>=3.13',
@@ -106,14 +103,19 @@ setup(
         'sqlalchemy',
         'sqlalchemy-utils',
         'sqlparse',
-        'tableschema',
-        'thrift>=0.9.3',
-        'thrift-sasl>=0.2.1',
         'unicodecsv',
     ],
     extras_require={
         'cors': ['flask-cors>=2.0.0'],
         'console_log': ['console_log==0.2.10'],
+        'hive': [
+            'pyhive>=0.4.0',
+            'tableschema',
+            'thrift-sasl>=0.2.1',
+            'thrift>=0.9.3',
+        ],
+        'presto': ['pyhive>=0.4.0'],
+        'gsheets': ['gsheetsdb>=0.1.9'],
     },
     author='Apache Software Foundation',
     author_email='dev@superset.incubator.apache.org',
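
The `presto` extra repeats the pyhive pin that also appears under `hive`. A sketch of one way to express that overlap once (illustrative only; this is not what the commit ships, and the package metadata below is made up):

from setuptools import setup

presto_requires = ['pyhive>=0.4.0']
hive_requires = presto_requires + [
    'tableschema',
    'thrift>=0.9.3',
    'thrift-sasl>=0.2.1',
]

setup(
    name='example-extras-demo',  # placeholder metadata, not Superset's
    version='0.0.1',
    install_requires=[],
    extras_require={
        'presto': presto_requires,
        'hive': hive_requires,
        'gsheets': ['gsheetsdb>=0.1.9'],
    },
)

With this layout, installing the hive extra pulls in the Presto requirements as well, mirroring how Hive support builds on pyhive.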


@@ -17,7 +17,6 @@
 # pylint: disable=C,R,W
 import json
 
-from past.builtins import basestring
 from sqlalchemy import (
     and_, Boolean, Column, Integer, String, Text,
 )
@@ -218,7 +217,7 @@ class BaseDatasource(AuditMixinNullable, ImportMixin):
             values, target_column_is_numeric=False, is_list_target=False):
         def handle_single_value(v):
             # backward compatibility with previous <select> components
-            if isinstance(v, basestring):
+            if isinstance(v, str):
                 v = v.strip('\t\n \'"')
                 if target_column_is_numeric:
                     # For backwards compatibility and edge cases


@@ -23,7 +23,6 @@ from flask_appbuilder.models.sqla.interface import SQLAInterface
 from flask_appbuilder.security.decorators import has_access
 from flask_babel import gettext as __
 from flask_babel import lazy_gettext as _
-from past.builtins import basestring
 
 from superset import appbuilder, db, security_manager
 from superset.connectors.base.views import DatasourceModelView
@@ -301,7 +300,7 @@ class TableModelView(DatasourceModelView, DeleteMixin, YamlExportMixin):  # noqa
     def edit(self, pk):
         """Simple hack to redirect to explore view after saving"""
         resp = super(TableModelView, self).edit(pk)
-        if isinstance(resp, basestring):
+        if isinstance(resp, str):
             return resp
         return redirect('/superset/explore/table/{}/'.format(pk))


@@ -29,7 +29,6 @@ import numpy as np
 import pandas as pd
 from pandas.core.common import _maybe_box_datetimelike
 from pandas.core.dtypes.dtypes import ExtensionDtype
-from past.builtins import basestring
 
 from superset.utils.core import JS_MAX_INTEGER
 
@@ -144,7 +143,7 @@ class SupersetDataFrame(object):
     def is_date(np_dtype, db_type_str):
 
         def looks_daty(s):
-            if isinstance(s, basestring):
+            if isinstance(s, str):
                 return any([s.lower().startswith(ss) for ss in ('time', 'date')])
             return False
 
@@ -203,7 +202,7 @@ class SupersetDataFrame(object):
             if not db_type_str or db_type_str.upper() == 'OBJECT':
                 v = sample[col].iloc[0] if not sample[col].empty else None
-                if isinstance(v, basestring):
+                if isinstance(v, str):
                     column['type'] = 'STRING'
                 elif isinstance(v, int):
                     column['type'] = 'INT'


@@ -40,7 +40,6 @@ import time
 from flask import g
 from flask_babel import lazy_gettext as _
 import pandas
-from past.builtins import basestring
 import sqlalchemy as sqla
 from sqlalchemy import Column, select
 from sqlalchemy.engine import create_engine
@@ -48,7 +47,6 @@ from sqlalchemy.engine.url import make_url
 from sqlalchemy.sql import quoted_name, text
 from sqlalchemy.sql.expression import TextAsFrom
 import sqlparse
-from tableschema import Table
 from werkzeug.utils import secure_filename
 
 from superset import app, conf, db, sql_parse
@@ -143,7 +141,7 @@ class BaseEngineSpec(object):
 
     @classmethod
     def get_datatype(cls, type_code):
-        if isinstance(type_code, basestring) and len(type_code):
+        if isinstance(type_code, str) and len(type_code):
             return type_code.upper()
 
     @classmethod
@@ -709,7 +707,7 @@ class MySQLEngineSpec(BaseEngineSpec):
         datatype = type_code
         if isinstance(type_code, int):
             datatype = cls.type_code_map.get(type_code)
-        if datatype and isinstance(datatype, basestring) and len(datatype):
+        if datatype and isinstance(datatype, str) and len(datatype):
             return datatype
 
     @classmethod
@@ -1123,6 +1121,8 @@ class HiveEngineSpec(PrestoEngineSpec):
         upload_path = config['UPLOAD_FOLDER'] + \
             secure_filename(filename)
+        # Optional dependency
+        from tableschema import Table  # pylint: disable=import-error
         hive_table_schema = Table(upload_path).infer()
         column_name_and_type = []
         for column_info in hive_table_schema['fields']:
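
The deferred `from tableschema import Table` keeps the engine-spec module importable when the `hive` extra is absent; the import only runs when someone actually uploads a CSV to Hive. A common refinement (an illustrative sketch, not part of this commit; the helper name is made up) wraps the import so the error points at the extra to install:

def _load_tableschema_table():
    """Import tableschema lazily and point at the hive extra if it is missing."""
    try:
        from tableschema import Table  # optional dependency from superset[hive]
    except ImportError as ex:
        raise ImportError(
            'tableschema is required for CSV-to-Hive uploads; '
            'install it with `pip install superset[hive]`') from ex
    return Table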


@@ -15,14 +15,11 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=C,R,W
-from pyhive import hive  # pylint: disable=no-name-in-module
-from TCLIService import ttypes
-from thrift import Thrift
 
 
 # TODO: contribute back to pyhive.
 def fetch_logs(self, max_rows=1024,
-               orientation=ttypes.TFetchOrientation.FETCH_NEXT):
+               orientation=None):
     """Mocked. Retrieve the logs produced by the execution of the query.
     Can be called multiple times to fetch the logs produced after
     the previous call.
@@ -31,6 +28,10 @@ def fetch_logs(self, max_rows=1024,
     .. note::
         This is not a part of DB-API.
     """
+    from pyhive import hive
+    from TCLIService import ttypes
+    from thrift import Thrift
+    orientation = orientation or ttypes.TFetchOrientation.FETCH_NEXT
     try:
         req = ttypes.TGetLogReq(operationHandle=self._operationHandle)
         logs = self._connection.client.GetLog(req).log
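
Here the module-level pyhive/thrift imports move inside `fetch_logs`, and the default for `orientation` becomes `None` because `ttypes` is no longer available when the signature is evaluated. The same pattern in isolation (an illustrative sketch; only the TCLIService names come from the hunk above, the function itself is made up):

def fetch_orientation(orientation=None):
    """Resolve a default that lives in an optional dependency at call time."""
    # Imported here so the module still loads without the hive extra installed.
    from TCLIService import ttypes
    return orientation or ttypes.TFetchOrientation.FETCH_NEXT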


@@ -16,7 +16,6 @@
 # under the License.
 # pylint: disable=C,R,W
 """Utility functions used across Superset"""
-from builtins import object
 from datetime import date, datetime, time, timedelta
 import decimal
 from email.mime.application import MIMEApplication
@@ -48,7 +47,6 @@ import markdown as md
 import numpy
 import pandas as pd
 import parsedatetime
-from past.builtins import basestring
 from pydruid.utils.having import Having
 import sqlalchemy as sa
 from sqlalchemy import event, exc, select, Text
@@ -88,7 +86,7 @@ def flasher(msg, severity=None):
         logging.info(msg)
 
 
-class _memoized(object):  # noqa
+class _memoized:  # noqa
     """Decorator that caches a function's return value each time it is called
 
     If called later with the same arguments, the cached value is returned, and
@@ -503,7 +501,7 @@ def table_has_constraint(table, name, db):
     return False
 
 
-class timeout(object):
+class timeout:
     """
     To be used in a ``with`` block and timeout its content.
     """
@@ -569,7 +567,7 @@ def pessimistic_connection_handling(some_engine):
         connection.should_close_with_result = save_should_close_with_result
 
 
-class QueryStatus(object):
+class QueryStatus:
     """Enum-type class for query statuses"""
 
     STOPPED = 'stopped'
@@ -678,7 +676,7 @@ def send_MIME_email(e_from, e_to, mime_msg, config, dryrun=False):
 
 
 def get_email_address_list(address_string):
-    if isinstance(address_string, basestring):
+    if isinstance(address_string, str):
         if ',' in address_string:
             address_string = address_string.split(',')
         elif '\n' in address_string:


@@ -43,7 +43,6 @@ from markdown import markdown
 import numpy as np
 import pandas as pd
 from pandas.tseries.frequencies import to_offset
-from past.builtins import basestring
 import polyline
 import simplejson as json
@@ -1612,8 +1611,8 @@ class SankeyViz(BaseViz):
 
     def get_data(self, df):
         df.columns = ['source', 'target', 'value']
-        df['source'] = df['source'].astype(basestring)
-        df['target'] = df['target'].astype(basestring)
+        df['source'] = df['source'].astype(str)
+        df['target'] = df['target'].astype(str)
         recs = df.to_dict(orient='records')
 
         hierarchy = defaultdict(set)


@@ -20,8 +20,6 @@ import subprocess
 import time
 import unittest
 
-from past.builtins import basestring
-
 from superset import app, db
 from superset.models.helpers import QueryStatus
 from superset.models.sql_lab import Query
@@ -239,7 +237,7 @@ class CeleryTestCase(SupersetTestCase):
     @staticmethod
     def de_unicode_dict(d):
         def str_if_basestring(o):
-            if isinstance(o, basestring):
+            if isinstance(o, str):
                 return str(o)
             return o
         return {str_if_basestring(k): str_if_basestring(d[k]) for k in d}