feat: create dtype option for csv upload (#23716)

Elizabeth Thompson 2023-04-24 12:53:53 -07:00 committed by GitHub
parent 4873c0990a
commit 71106cfd97
6 changed files with 160 additions and 2 deletions

superset/db_engine_specs/redshift.py

@@ -18,12 +18,16 @@ import logging
import re
from typing import Any, Dict, Optional, Pattern, Tuple
import pandas as pd
from flask_babel import gettext as __
from sqlalchemy.types import NVARCHAR
from superset.db_engine_specs.base import BasicParametersMixin
from superset.db_engine_specs.postgres import PostgresBaseEngineSpec
from superset.errors import SupersetErrorType
from superset.models.core import Database
from superset.models.sql_lab import Query
from superset.sql_parse import Table
logger = logging.getLogger()
@@ -96,6 +100,42 @@ class RedshiftEngineSpec(PostgresBaseEngineSpec, BasicParametersMixin):
        ),
    }
    @classmethod
    def df_to_sql(
        cls,
        database: Database,
        table: Table,
        df: pd.DataFrame,
        to_sql_kwargs: Dict[str, Any],
    ) -> None:
        """
        Upload data from a Pandas DataFrame to a database.

        For regular engines this calls the `pandas.DataFrame.to_sql` method.

        Overrides the base class so that pandas string-typed columns are
        created as nvarchar(max) columns, since Redshift does not support
        text data types.

        Note this method does not create metadata for the table.

        :param database: The database to upload the data to
        :param table: The table to upload the data to
        :param df: The dataframe with data to be uploaded
        :param to_sql_kwargs: The kwargs to be passed to the
            `pandas.DataFrame.to_sql` method
        """
        to_sql_kwargs = to_sql_kwargs or {}
        to_sql_kwargs["dtype"] = {
            # use the max size for a Redshift nvarchar, 65535
            # (the default object and string dtypes create a varchar(256))
            col_name: NVARCHAR(length=65535)
            for col_name, col_type in zip(df.columns, df.dtypes)
            if isinstance(col_type, pd.StringDtype)
        }
        super().df_to_sql(
            df=df, database=database, table=table, to_sql_kwargs=to_sql_kwargs
        )
    @staticmethod
    def _mutate_label(label: str) -> str:
        """

superset/templates/superset/form_view/csv_to_database.html

@@ -104,6 +104,10 @@
{{ lib.render_field(form.overwrite_duplicate, begin_sep_label, end_sep_label, begin_sep_field,
end_sep_field) }}
</tr>
<tr>
{{ lib.render_field(form.dtype, begin_sep_label, end_sep_label, begin_sep_field,
end_sep_field) }}
</tr>
{% endcall %}
{% call csv_macros.render_collapsable_form_group("accordion3", "Rows") %}
<tr>

superset/views/database/forms.py

@@ -140,6 +140,16 @@ class CsvToDatabaseForm(UploadToDatabaseForm):
        get_pk=lambda a: a.id,
        get_label=lambda a: a.database_name,
    )
    dtype = StringField(
        _("Column Data Types"),
        description=_(
            "A dictionary with column names and their data types"
            " if you need to change the defaults."
            ' Example: {"user_id":"integer"}'
        ),
        validators=[Optional()],
        widget=BS3TextFieldWidget(),
    )
    schema = StringField(
        _("Schema"),
        description=_("Select a schema if the database supports this"),

superset/views/database/views.py

@@ -15,6 +15,7 @@
# specific language governing permissions and limitations
# under the License.
import io
import json
import os
import tempfile
import zipfile
@@ -189,6 +190,7 @@ class CsvToDatabaseView(CustomFormView):
        delimiter_input = form.otherInput.data
        try:
            kwargs = {"dtype": json.loads(form.dtype.data)} if form.dtype.data else {}
            df = pd.concat(
                pd.read_csv(
                    chunksize=1000,
@@ -208,6 +210,7 @@
                    skip_blank_lines=form.skip_blank_lines.data,
                    skipinitialspace=form.skip_initial_space.data,
                    skiprows=form.skiprows.data,
                    **kwargs,
                )
            )
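
Note: a minimal sketch of the wiring above, runnable outside Superset. The form's JSON string is parsed with json.loads and forwarded to pandas.read_csv as its dtype= argument (the sample CSV content is made up):

import io
import json

import pandas as pd

form_dtype = '{"a": "string", "b": "float64"}'  # as submitted from the form
kwargs = {"dtype": json.loads(form_dtype)} if form_dtype else {}

df = pd.read_csv(io.StringIO("a,b\njohn,1\npaul,2\n"), **kwargs)
print(df.dtypes)  # a: string (extension dtype), b: float64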

tests/integration_tests/csv_upload_tests.py

@@ -20,7 +20,7 @@ import json
import logging
import os
import shutil
-from typing import Dict, Optional
+from typing import Dict, Optional, Union
from unittest import mock
@@ -129,7 +129,12 @@ def get_upload_db():
    return db.session.query(Database).filter_by(database_name=CSV_UPLOAD_DATABASE).one()

-def upload_csv(filename: str, table_name: str, extra: Optional[Dict[str, str]] = None):
+def upload_csv(
+    filename: str,
+    table_name: str,
+    extra: Optional[Dict[str, str]] = None,
+    dtype: Union[str, None] = None,
+):
    csv_upload_db_id = get_upload_db().id
    schema = utils.get_example_default_schema()
    form_data = {
@@ -145,6 +150,8 @@ def upload_csv(filename: str, table_name: str, extra: Optional[Dict[str, str]] =
form_data["schema"] = schema
if extra:
form_data.update(extra)
if dtype:
form_data["dtype"] = dtype
return get_resp(test_client, "/csvtodatabaseview/form", data=form_data)
@@ -386,6 +393,39 @@ def test_import_csv(mock_event_logger):
        data = engine.execute(f"SELECT * from {CSV_UPLOAD_TABLE}").fetchall()
        assert data == [("john", 1, "x"), ("paul", 2, None)]

    # cleanup
    with get_upload_db().get_sqla_engine_with_context() as engine:
        engine.execute(f"DROP TABLE {full_table_name}")

    # with dtype
    upload_csv(
        CSV_FILENAME1,
        CSV_UPLOAD_TABLE,
        dtype='{"a": "string", "b": "float64"}',
    )
    # changing a column to a compatible dtype (e.g. object to string,
    # or int to float) should succeed, and the upload works as normal
    with test_db.get_sqla_engine_with_context() as engine:
        data = engine.execute(f"SELECT * from {CSV_UPLOAD_TABLE}").fetchall()
        assert data == [("john", 1), ("paul", 2)]

    # cleanup
    with get_upload_db().get_sqla_engine_with_context() as engine:
        engine.execute(f"DROP TABLE {full_table_name}")

    # with dtype - wrong type
    resp = upload_csv(
        CSV_FILENAME1,
        CSV_UPLOAD_TABLE,
        dtype='{"a": "int"}',
    )
    # an incompatible dtype fails the upload: column "a" holds text
    # that cannot be cast to int
    fail_msg = (
        f"Unable to upload CSV file {escaped_double_quotes(CSV_FILENAME1)} "
        f"to table {escaped_double_quotes(CSV_UPLOAD_TABLE)}"
    )
    assert fail_msg in resp
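
Note: the failure asserted above originates in pandas, not Superset. Casting non-numeric text to an integer dtype raises ValueError inside read_csv, which the view reports as the upload error. A standalone sketch (sample data made up):

import io

import pandas as pd

try:
    pd.read_csv(io.StringIO("a,b\njohn,1\npaul,2\n"), dtype={"a": "int"})
except ValueError as ex:
    # e.g. "invalid literal for int() with base 10: 'john'"
    print(ex)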
@pytest.mark.usefixtures("setup_csv_upload_with_context")
@pytest.mark.usefixtures("create_excel_files")

tests/integration_tests/db_engine_specs/redshift_tests.py

@@ -14,11 +14,18 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import unittest.mock as mock
from textwrap import dedent
import numpy as np
import pandas as pd
from sqlalchemy.types import NVARCHAR
from superset.db_engine_specs.redshift import RedshiftEngineSpec
from superset.errors import ErrorLevel, SupersetError, SupersetErrorType
from superset.sql_parse import Table
from tests.integration_tests.db_engine_specs.base_tests import TestDbEngineSpec
from tests.integration_tests.test_app import app
class TestRedshiftDbEngineSpec(TestDbEngineSpec):
@@ -183,3 +190,57 @@ psql: error: could not connect to server: Operation timed out
                },
            )
        ]
    def test_df_to_sql_no_dtype(self):
        mock_database = mock.MagicMock()
        mock_database.get_df.return_value.empty = False
        table_name = "foobar"

        data = [
            ("foo", "bar", pd.NA, None),
            ("foo", "bar", pd.NA, True),
            ("foo", "bar", pd.NA, None),
        ]
        numpy_dtype = [
            ("id", "object"),
            ("value", "object"),
            ("num", "object"),
            ("bool", "object"),
        ]
        column_names = ["id", "value", "num", "bool"]

        test_array = np.array(data, dtype=numpy_dtype)
        df = pd.DataFrame(test_array, columns=column_names)
        df.to_sql = mock.MagicMock()

        with app.app_context():
            RedshiftEngineSpec.df_to_sql(
                mock_database, Table(table=table_name), df, to_sql_kwargs={}
            )

        assert df.to_sql.call_args[1]["dtype"] == {}
    def test_df_to_sql_with_string_dtype(self):
        mock_database = mock.MagicMock()
        mock_database.get_df.return_value.empty = False
        table_name = "foobar"

        data = [
            ("foo", "bar", pd.NA, None),
            ("foo", "bar", pd.NA, True),
            ("foo", "bar", pd.NA, None),
        ]
        column_names = ["id", "value", "num", "bool"]
        df = pd.DataFrame(data, columns=column_names)
        df = df.astype(dtype={"value": "string"})
        df.to_sql = mock.MagicMock()

        with app.app_context():
            RedshiftEngineSpec.df_to_sql(
                mock_database, Table(table=table_name), df, to_sql_kwargs={}
            )

        # the varchar string length should be 65535
        dtype = df.to_sql.call_args[1]["dtype"]
        assert isinstance(dtype["value"], NVARCHAR)
        assert dtype["value"].length == 65535