Upload excel (#9825)

This commit is contained in:
Hossein Torabi 2020-07-03 09:58:30 +04:30 committed by GitHub
parent cf60f664a4
commit fdd28c1a5e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 465 additions and 26 deletions

View File

@ -252,6 +252,8 @@ If you run a production system you should schedule downtime for this
upgrade.
The PRs below have more information about the breaking changes:
* [9825](https://github.com/apache/incubator-superset/pull/9825): Support for Excel sheet upload added. To enable support, install Superset with the optional dependency `excel`
* [4587](https://github.com/apache/incubator-superset/pull/4587) : a backward
incompatible database migration that requires downtime. Once the
db migration succeeds, the web server needs to be restarted with the

View File

@ -100,4 +100,4 @@ yarl==1.4.2 # via aiohttp
zipp==3.1.0 # via importlib-metadata
# The following packages are considered to be unsafe in a requirements file:
# setuptools
# setuptools

View File

@ -123,6 +123,7 @@ setup(
"dremio": ["sqlalchemy_dremio>=1.1.0"],
"cockroachdb": ["cockroachdb==0.3.3"],
"thumbnails": ["Pillow>=7.0.0, <8.0.0"],
"excel": ["xlrd>=1.2.0, <1.3"],
},
python_requires="~=3.6",
author="Apache Software Foundation",

View File

@ -159,7 +159,11 @@ class SupersetAppInitializer:
DashboardModelViewAsync,
)
from superset.views.database.api import DatabaseRestApi
from superset.views.database.views import DatabaseView, CsvToDatabaseView
from superset.views.database.views import (
DatabaseView,
CsvToDatabaseView,
ExcelToDatabaseView,
)
from superset.views.datasource import Datasource
from superset.views.log.api import LogRestApi
from superset.views.log.views import LogModelView
@ -265,6 +269,7 @@ class SupersetAppInitializer:
appbuilder.add_view_no_menu(Api)
appbuilder.add_view_no_menu(CssTemplateAsyncModelView)
appbuilder.add_view_no_menu(CsvToDatabaseView)
appbuilder.add_view_no_menu(ExcelToDatabaseView)
appbuilder.add_view_no_menu(Dashboard)
appbuilder.add_view_no_menu(DashboardModelViewAsync)
appbuilder.add_view_no_menu(Datasource)
@ -324,15 +329,35 @@ class SupersetAppInitializer:
category="SQL Lab",
category_label=__("SQL Lab"),
)
appbuilder.add_link(
"Upload a CSV",
label=__("Upload a CSV"),
href="/csvtodatabaseview/form",
icon="fa-upload",
category="Sources",
category_label=__("Sources"),
category_icon="fa-wrench",
)
if self.config["CSV_EXTENSIONS"].intersection(
self.config["ALLOWED_EXTENSIONS"]
):
appbuilder.add_link(
"Upload a CSV",
label=__("Upload a CSV"),
href="/csvtodatabaseview/form",
icon="fa-upload",
category="Sources",
category_label=__("Sources"),
category_icon="fa-wrench",
)
try:
import xlrd # pylint: disable=unused-import
if self.config["EXCEL_EXTENSIONS"].intersection(
self.config["ALLOWED_EXTENSIONS"]
):
appbuilder.add_link(
"Upload Excel",
label=__("Upload Excel"),
href="/exceltodatabaseview/form",
icon="fa-upload",
category="Sources",
category_label=__("Sources"),
category_icon="fa-wrench",
)
except ImportError:
pass
#
# Conditionally setup log views

View File

@ -365,8 +365,9 @@ CORS_OPTIONS: Dict[Any, Any] = {}
SUPERSET_WEBSERVER_DOMAINS = None
# Allowed format types for upload on Database view
# TODO: Add processing of other spreadsheet formats (xls, xlsx etc)
ALLOWED_EXTENSIONS = {"csv", "tsv"}
EXCEL_EXTENSIONS = {"xlsx", "xls"}
CSV_EXTENSIONS = {"csv", "tsv"}
ALLOWED_EXTENSIONS = {*EXCEL_EXTENSIONS, *CSV_EXTENSIONS}
# CSV Options: key/value pairs that will be passed as argument to DataFrame.to_csv
# method.

View File

@ -430,6 +430,20 @@ class BaseEngineSpec: # pylint: disable=too-many-public-methods
parsed_query = sql_parse.ParsedQuery(sql)
return parsed_query.set_or_update_query_limit(limit)
@staticmethod
def excel_to_df(**kwargs: Any) -> pd.DataFrame:
    """ Read excel into Pandas DataFrame

    :param kwargs: reader options. ``filepath_or_buffer`` (required) is the
        workbook to read and ``sheet_name`` (required) selects the sheet(s).
        ``header``, ``index_col``, ``skiprows`` and ``nrows`` are forwarded to
        ``pandas.read_excel`` when present; every other key is ignored.
    :return: Pandas DataFrame containing data from excel. When more than one
        sheet is selected the rows of all selected sheets are concatenated.
    """
    sheet_name = kwargs["sheet_name"]
    if sheet_name == "":
        # An empty string (e.g. a blank text input) can never name a sheet;
        # treat it as "no selection", which read_excel expresses as None
        # and answers with every sheet of the workbook.
        sheet_name = None

    # Only forward options that read_excel understands; callers may pass
    # CSV-style keys that have no Excel counterpart.
    reader_options = {
        key: kwargs[key]
        for key in ("header", "index_col", "skiprows", "nrows")
        if key in kwargs
    }
    result = pd.read_excel(
        io=kwargs["filepath_or_buffer"], sheet_name=sheet_name, **reader_options
    )

    # read_excel returns a dict of DataFrames for None / list selections and
    # a plain DataFrame for a single sheet name or index.
    if isinstance(result, dict):
        return pd.concat(list(result.values()))
    return result
@staticmethod
def csv_to_df(**kwargs: Any) -> pd.DataFrame:
""" Read csv into Pandas DataFrame
@ -486,6 +500,28 @@ class BaseEngineSpec: # pylint: disable=too-many-public-methods
"""
return None
@classmethod
def create_table_from_excel(  # pylint: disable=too-many-arguments
    cls,
    filename: str,
    table: Table,
    database: "Database",
    excel_to_df_kwargs: Dict[str, Any],
    df_to_sql_kwargs: Dict[str, Any],
) -> None:
    """
    Create table from contents of an Excel file. Note: this method does not
    create metadata for the table.

    :param filename: path of the Excel file to load
    :param table: target table; its schema, when set, is used for the insert
    :param database: database whose engine receives the data
    :param excel_to_df_kwargs: options passed on to ``excel_to_df``
    :param df_to_sql_kwargs: options passed on to ``df_to_sql``; the dict
        supplied by the caller is left untouched
    """
    df = cls.excel_to_df(filepath_or_buffer=filename, **excel_to_df_kwargs)
    engine = cls.get_engine(database)

    # Work on a copy so the schema/method entries added below do not leak
    # back into the caller's dictionary.
    to_sql_kwargs = dict(df_to_sql_kwargs)
    if table.schema:
        # only add schema when it is preset and non empty
        to_sql_kwargs["schema"] = table.schema
    if engine.dialect.supports_multivalues_insert:
        to_sql_kwargs["method"] = "multi"
    cls.df_to_sql(df=df, con=engine, **to_sql_kwargs)
@classmethod
def get_all_datasource_names(
cls, database: "Database", datasource_type: str

View File

@ -0,0 +1,64 @@
{#
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
#}
{% extends 'appbuilder/general/model/edit.html' %}
{% block tail_js %}
{{ super() }}
<script>
var db = $("#con");
var schema = $("#schema");
// Keep a pristine copy of the schema field as it is first rendered, so it
// can be put back when no schema restriction applies.
var any_schema_is_allowed = schema.clone();
// Re-query the allowed schemas whenever another database is picked...
db.on("change", function () {
  update_schemas_allowed_for_excel_upload(db.val());
});
// ...and once up front for the database selected on page load.
update_schemas_allowed_for_excel_upload(db.val());
// Fetch the schemas that accept uploads for the given database id and rebuild
// the schema field from the response.
// NOTE(review): confirm the server registers this route; its handler is not
// part of the code visible alongside this template.
function update_schemas_allowed_for_excel_upload(db_id) {
  $.ajax({
    method: "GET",
    url: "/superset/schemas_access_for_excel_upload",
    data: {db_id: db_id},
    dataType: 'json',
    contentType: "application/json; charset=utf-8"
  }).done(function(data) {
    change_schema_field_in_formview(data);
  }).fail(function(error) {
    // responseJSON is only populated when the error body parsed as JSON;
    // fall back to the HTTP status text so a non-JSON failure still
    // produces a readable alert and does not throw inside the handler.
    var payload = error.responseJSON;
    var errorMsg = (payload && payload.error) || error.statusText || "Unknown error";
    alert("ERROR: " + errorMsg);
  });
}
// Swap the schema field for a dropdown of the allowed schemas, or restore the
// saved original field when the list is empty or missing.
function change_schema_field_in_formview(schemas_allowed){
  if (schemas_allowed && schemas_allowed.length > 0) {
    // Build the elements through the DOM API and not by concatenating
    // markup, so schema names containing quotes or angle brackets are
    // treated as text and cannot break or inject HTML.
    var dropdown_schema_lists = $("<select>", {id: "schema", name: "schema", required: true});
    schemas_allowed.forEach(function(schema_allowed) {
      dropdown_schema_lists.append(
        $("<option>").attr("value", schema_allowed).text(schema_allowed)
      );
    });
    $("#schema").replaceWith(dropdown_schema_lists);
  } else {
    $("#schema").replaceWith(any_schema_is_allowed);
  }
}
</script>
{% endblock %}

View File

@ -91,11 +91,15 @@ class CsvToDatabaseForm(DynamicForm):
validators=[
FileRequired(),
FileAllowed(
config["ALLOWED_EXTENSIONS"],
config["ALLOWED_EXTENSIONS"].intersection(config["CSV_EXTENSIONS"]),
_(
"Only the following file extensions are allowed: "
"%(allowed_extensions)s",
allowed_extensions=", ".join(config["ALLOWED_EXTENSIONS"]),
allowed_extensions=", ".join(
config["ALLOWED_EXTENSIONS"].intersection(
config["CSV_EXTENSIONS"]
)
),
),
),
],
@ -206,3 +210,169 @@ class CsvToDatabaseForm(DynamicForm):
validators=[Optional()],
widget=BS3TextFieldWidget(),
)
class ExcelToDatabaseForm(DynamicForm):
    """Form collecting an Excel file plus the options used to load it into a table."""

    # pylint: disable=E0211
    def excel_allowed_dbs():  # type: ignore
        """
        Query factory for the "Database" dropdown: databases flagged with
        ``allow_csv_upload`` for which ``at_least_one_schema_is_allowed`` holds.
        """
        # TODO: change allow_csv_upload to allow_file_upload
        excel_enabled_dbs = (
            db.session.query(Database).filter_by(allow_csv_upload=True).all()
        )
        return [
            excel_enabled_db
            for excel_enabled_db in excel_enabled_dbs
            if ExcelToDatabaseForm.at_least_one_schema_is_allowed(excel_enabled_db)
        ]

    @staticmethod
    def at_least_one_schema_is_allowed(database: Database) -> bool:
        """
        Tell whether the current user may upload to at least one schema of
        ``database``.

        If the user has access to the database or all datasource
            1. if schemas_allowed_for_csv_upload is empty
                a) if database does not support schema
                    user is able to upload excel without specifying schema name
                b) if database supports schema
                    user is able to upload excel to any schema
            2. if schemas_allowed_for_csv_upload is not empty
                a) if database does not support schema
                    This situation is impossible and upload will fail
                b) if database supports schema
                    user is able to upload to schema in schemas_allowed_for_csv_upload
        elif the user does not have access to the database or all datasource
            1. if schemas_allowed_for_csv_upload is empty
                a) if database does not support schema
                    user is unable to upload excel
                b) if database supports schema
                    user is unable to upload excel
            2. if schemas_allowed_for_csv_upload is not empty
                a) if database does not support schema
                    This situation is impossible and user is unable to upload excel
                b) if database supports schema
                    user is able to upload to schema in schemas_allowed_for_csv_upload

        :param database: database to check
        :return: True when the user has database or all-datasource access, or
            when at least one of the database's upload schemas is accessible
            to the user
        """
        if (
            security_manager.database_access(database)
            or security_manager.all_datasource_access()
        ):
            return True
        schemas = database.get_schema_access_for_csv_upload()
        return bool(
            schemas
            and security_manager.schemas_accessible_by_user(database, schemas, False)
        )

    name = StringField(
        _("Table Name"),
        description=_("Name of table to be created from excel data."),
        validators=[DataRequired()],
        widget=BS3TextFieldWidget(),
    )
    excel_file = FileField(
        _("Excel File"),
        description=_("Select a Excel file to be uploaded to a database."),
        validators=[
            FileRequired(),
            FileAllowed(
                # Only Excel extensions that are also globally allowed
                config["ALLOWED_EXTENSIONS"].intersection(config["EXCEL_EXTENSIONS"]),
                _(
                    "Only the following file extensions are allowed: "
                    "%(allowed_extensions)s",
                    allowed_extensions=", ".join(
                        config["ALLOWED_EXTENSIONS"].intersection(
                            config["EXCEL_EXTENSIONS"]
                        )
                    ),
                ),
            ),
        ],
    )
    # The description is wrapped in _() and the field uses the Bootstrap text
    # widget so it is translated and rendered like the other text fields.
    sheet_name = StringField(
        _("Sheet Name"),
        description=_("Sheet Name"),
        validators=[Optional()],
        widget=BS3TextFieldWidget(),
    )
    con = QuerySelectField(
        _("Database"),
        query_factory=excel_allowed_dbs,
        get_pk=lambda a: a.id,
        get_label=lambda a: a.database_name,
    )
    schema = StringField(
        _("Schema"),
        description=_("Specify a schema (if database flavor supports this)."),
        validators=[Optional()],
        widget=BS3TextFieldWidget(),
    )
    if_exists = SelectField(
        _("Table Exists"),
        description=_(
            "If table exists do one of the following: "
            "Fail (do nothing), Replace (drop and recreate table) "
            "or Append (insert data)."
        ),
        choices=[
            ("fail", _("Fail")),
            ("replace", _("Replace")),
            ("append", _("Append")),
        ],
        validators=[DataRequired()],
    )
    header = IntegerField(
        _("Header Row"),
        description=_(
            "Row containing the headers to use as "
            "column names (0 is first line of data). "
            "Leave empty if there is no header row."
        ),
        validators=[Optional(), NumberRange(min=0)],
        widget=BS3TextFieldWidget(),
    )
    index_col = IntegerField(
        _("Index Column"),
        description=_(
            "Column to use as the row labels of the "
            "dataframe. Leave empty if no index column."
        ),
        validators=[Optional(), NumberRange(min=0)],
        widget=BS3TextFieldWidget(),
    )
    mangle_dupe_cols = BooleanField(
        _("Mangle Duplicate Columns"),
        description=_('Specify duplicate columns as "X.0, X.1".'),
    )
    skipinitialspace = BooleanField(
        _("Skip Initial Space"), description=_("Skip spaces after delimiter.")
    )
    skiprows = IntegerField(
        _("Skip Rows"),
        description=_("Number of rows to skip at start of file."),
        validators=[Optional(), NumberRange(min=0)],
        widget=BS3TextFieldWidget(),
    )
    nrows = IntegerField(
        _("Rows to Read"),
        description=_("Number of rows of file to read."),
        validators=[Optional(), NumberRange(min=0)],
        widget=BS3TextFieldWidget(),
    )
    decimal = StringField(
        _("Decimal Character"),
        default=".",
        description=_("Character to interpret as decimal point."),
        validators=[Optional(), Length(min=1, max=1)],
        widget=BS3TextFieldWidget(),
    )
    index = BooleanField(
        _("Dataframe Index"), description=_("Write dataframe index as a column.")
    )
    index_label = StringField(
        _("Column Label(s)"),
        description=_(
            "Column label for index column(s). If None is given "
            "and Dataframe Index is True, Index Names are used."
        ),
        validators=[Optional()],
        widget=BS3TextFieldWidget(),
    )

View File

@ -20,9 +20,9 @@ from typing import TYPE_CHECKING
from flask import flash, g, redirect
from flask_appbuilder import SimpleFormView
from flask_appbuilder.forms import DynamicForm
from flask_appbuilder.models.sqla.interface import SQLAInterface
from flask_babel import lazy_gettext as _
from werkzeug.wrappers import Response
from wtforms.fields import StringField
from wtforms.validators import ValidationError
@ -32,12 +32,10 @@ from superset.connectors.sqla.models import SqlaTable
from superset.constants import RouteMethod
from superset.exceptions import CertificateException
from superset.sql_parse import Table
from superset.typing import FlaskResponse
from superset.utils import core as utils
from superset.views.base import DeleteMixin, SupersetModelView, YamlExportMixin
from superset.views.database.forms import CsvToDatabaseForm
from .forms import CsvToDatabaseForm
from .forms import CsvToDatabaseForm, ExcelToDatabaseForm
from .mixins import DatabaseMixin
from .validators import schema_allows_csv_upload, sqlalchemy_uri_validator
@ -48,9 +46,7 @@ config = app.config
stats_logger = config["STATS_LOGGER"]
def sqlalchemy_uri_form_validator( # pylint: disable=unused-argument
form: DynamicForm, field: StringField
) -> None:
def sqlalchemy_uri_form_validator(_: _, field: StringField) -> None:
"""
Check if user has submitted a valid SQLAlchemy URI
"""
@ -58,9 +54,7 @@ def sqlalchemy_uri_form_validator( # pylint: disable=unused-argument
sqlalchemy_uri_validator(field.data, exception=ValidationError)
def certificate_form_validator( # pylint: disable=unused-argument
form: DynamicForm, field: StringField
) -> None:
def certificate_form_validator(_: _, field: StringField) -> None:
"""
Check if user has submitted a valid SSL certificate
"""
@ -116,7 +110,7 @@ class CsvToDatabaseView(SimpleFormView):
form.decimal.data = "."
form.if_exists.data = "fail"
def form_post(self, form: CsvToDatabaseForm) -> FlaskResponse:
def form_post(self, form: CsvToDatabaseForm) -> Response:
database = form.con.data
csv_table = Table(table=form.name.data, schema=form.schema.data)
@ -249,3 +243,149 @@ class CsvToDatabaseView(SimpleFormView):
flash(message, "info")
stats_logger.incr("successful_csv_upload")
return redirect("/tablemodelview/list/")
class ExcelToDatabaseView(SimpleFormView):
    """Form view that takes an uploaded Excel file and loads it into a database table."""

    form = ExcelToDatabaseForm
    form_template = "superset/form_view/excel_to_database_view/edit.html"
    form_title = _("Excel to Database configuration")
    add_columns = ["database", "schema", "table_name"]

    def form_get(self, form: ExcelToDatabaseForm) -> None:
        """Pre-fill the form with its default values before it is rendered."""
        form.header.data = 0
        form.mangle_dupe_cols.data = True
        form.skipinitialspace.data = False
        form.decimal.data = "."
        form.if_exists.data = "fail"
        # Set the field's value; assigning to ``form.sheet_name`` itself would
        # overwrite the field object with the new value.
        form.sheet_name.data = ""

    def form_post(self, form: ExcelToDatabaseForm) -> Response:
        """
        Load the uploaded Excel file into the requested table and register the
        table with Superset.

        :param form: the submitted, validated upload form
        :return: redirect to the table list on success, or back to the upload
            form (with a flashed error) on failure
        """
        database = form.con.data
        excel_table = Table(table=form.name.data, schema=form.schema.data)

        if not schema_allows_csv_upload(database, excel_table.schema):
            message = _(
                'Database "%(database_name)s" schema "%(schema_name)s" '
                "is not allowed for excel uploads. Please contact your Superset Admin.",
                database_name=database.database_name,
                schema_name=excel_table.schema,
            )
            flash(message, "danger")
            return redirect("/exceltodatabaseview/form")

        if "." in excel_table.table and excel_table.schema:
            # Placeholder names must match the keyword arguments below,
            # otherwise rendering the message raises a KeyError.
            message = _(
                "You cannot specify a namespace both in the name of the table: "
                '"%(table)s" and in the schema field: '
                '"%(schema)s". Please remove one',
                table=excel_table.table,
                schema=excel_table.schema,
            )
            flash(message, "danger")
            return redirect("/exceltodatabaseview/form")

        # The upload folder has to exist before a temp file can be created in it.
        utils.ensure_path_exists(config["UPLOAD_FOLDER"])
        with tempfile.NamedTemporaryFile(
            dir=app.config["UPLOAD_FOLDER"],
            suffix=os.path.splitext(form.excel_file.data.filename)[1].lower(),
            delete=False,
        ) as tmp_file:
            # Only the reserved name is needed; the handle is closed right away
            # and the file is removed explicitly further down.
            uploaded_tmp_file_path = tmp_file.name

        try:
            upload_stream_write(form.excel_file.data, uploaded_tmp_file_path)

            con = form.data.get("con")
            database = (
                db.session.query(models.Database).filter_by(id=con.data.get("id")).one()
            )
            excel_to_df_kwargs = {
                "header": form.header.data if form.header.data else 0,
                "index_col": form.index_col.data,
                "mangle_dupe_cols": form.mangle_dupe_cols.data,
                "skipinitialspace": form.skipinitialspace.data,
                "skiprows": form.skiprows.data,
                "nrows": form.nrows.data,
                # A blank field means "no specific sheet": pass None, which
                # pandas.read_excel answers with every sheet of the workbook.
                "sheet_name": form.sheet_name.data or None,
                "chunksize": 1000,
            }
            df_to_sql_kwargs = {
                "name": excel_table.table,
                "if_exists": form.if_exists.data,
                "index": form.index.data,
                "index_label": form.index_label.data,
                "chunksize": 1000,
            }
            database.db_engine_spec.create_table_from_excel(
                uploaded_tmp_file_path,
                excel_table,
                database,
                excel_to_df_kwargs,
                df_to_sql_kwargs,
            )

            # Connect table to the database that should be used for exploration.
            # E.g. if hive was used to upload a excel, presto will be a better option
            # to explore the table.
            explore_database = database
            explore_database_id = database.get_extra().get("explore_database_id", None)
            if explore_database_id:
                explore_database = (
                    db.session.query(models.Database)
                    .filter_by(id=explore_database_id)
                    .one_or_none()
                    or database
                )

            sqla_table = (
                db.session.query(SqlaTable)
                .filter_by(
                    table_name=excel_table.table,
                    schema=excel_table.schema,
                    database_id=explore_database.id,
                )
                .one_or_none()
            )

            if sqla_table:
                # Table is already registered: just refresh its metadata.
                sqla_table.fetch_metadata()
            else:
                sqla_table = SqlaTable(table_name=excel_table.table)
                sqla_table.database = explore_database
                sqla_table.database_id = database.id
                sqla_table.user_id = g.user.id
                sqla_table.schema = excel_table.schema
                sqla_table.fetch_metadata()
                db.session.add(sqla_table)
            db.session.commit()
        except Exception as ex:  # pylint: disable=broad-except
            db.session.rollback()
            try:
                os.remove(uploaded_tmp_file_path)
            except OSError:
                # Best-effort cleanup; the upload error below is what matters.
                pass
            message = _(
                'Unable to upload Excel file "%(filename)s" to table '
                '"%(table_name)s" in database "%(db_name)s". '
                "Error message: %(error_msg)s",
                filename=form.excel_file.data.filename,
                table_name=form.name.data,
                db_name=database.database_name,
                error_msg=str(ex),
            )

            flash(message, "danger")
            stats_logger.incr("failed_excel_upload")
            return redirect("/exceltodatabaseview/form")

        os.remove(uploaded_tmp_file_path)
        # Go back to welcome page / splash screen
        message = _(
            'Excel file "%(excel_filename)s" uploaded to table "%(table_name)s" in '
            'database "%(db_name)s"',
            excel_filename=form.excel_file.data.filename,
            table_name=str(excel_table),
            db_name=sqla_table.database.database_name,
        )
        flash(message, "info")
        stats_logger.incr("successful_excel_upload")
        return redirect("/tablemodelview/list/")