mirror of https://github.com/apache/superset.git
chore: Support Python 3.10 and bump pandas 1.4 and pyarrow 6 (#21002)
* Bump pandas 1.4 and pyarrow 6
* Use engine="pyarrow" for pd.read_csv()
* Refactoring
* Refactoring
* Refactoring
* Use bytes in pd.read_json()
* Fix test_contribution
* Fix pyarrow issue when 'arrays' are empty but 'names' contain values
* fix: ValueError: For argument "ascending" expected type bool, received type NoneType.
* Remove engine="pyarrow" and convert bytes to string
* Make a copy of the selected df to fix a pandas 1.4 regression
* Simplify pd.read_json() and pd.read_csv() for example data

Co-authored-by: Ville Brofeldt <ville.brofeldt@apple.com>
parent 94e8fd3b35
commit 76d6a9af91
@@ -420,7 +420,7 @@ Commits to `master` trigger a rebuild and redeploy of the documentation site. Su
 Make sure your machine meets the [OS dependencies](https://superset.apache.org/docs/installation/installing-superset-from-scratch#os-dependencies) before following these steps.
 You also need to install MySQL or [MariaDB](https://mariadb.com/downloads).

-Ensure that you are using Python version 3.8 or 3.9, then proceed with:
+Ensure that you are using Python version 3.8, 3.9 or 3.10, then proceed with:

 ```bash
 # Create a virtual environment and activate it (recommended)
Makefile — 6 changed lines

@@ -15,8 +15,8 @@
 # limitations under the License.
 #

-# Python version installed; we need 3.8-3.9
-PYTHON=`command -v python3.9 || command -v python3.8`
+# Python version installed; we need 3.8-3.10
+PYTHON=`command -v python3.10 || command -v python3.9 || command -v python3.8`

 .PHONY: install superset venv pre-commit

@@ -70,7 +70,7 @@ update-js:

 venv:
 	# Create a virtual environment and activate it (recommended)
-	if ! [ -x "${PYTHON}" ]; then echo "You need Python 3.8 or 3.9 installed"; exit 1; fi
+	if ! [ -x "${PYTHON}" ]; then echo "You need Python 3.8, 3.9 or 3.10 installed"; exit 1; fi
 	test -d venv || ${PYTHON} -m venv venv # setup a python3 virtualenv
 	. venv/bin/activate
@@ -26,6 +26,7 @@ assists people when migrating to a new version.

 - [20606](https://github.com/apache/superset/pull/20606): When a user clicks on a chart title or the "Edit chart" button in the Dashboard page, Explore opens in the same tab. Clicking while holding cmd/ctrl opens Explore in a new tab. To bring back the old behaviour (always opening Explore in a new tab), flip the feature flag `DASHBOARD_EDIT_CHART_IN_NEW_TAB` to `True`.
 - [20799](https://github.com/apache/superset/pull/20799): The Presto and Trino engines will now display a tracking URL for running queries in SQL Lab. If for some reason you don't want to show the tracking URL (for example, when your data warehouse hasn't enabled access to the Presto or Trino UI), update `TRACKING_URL_TRANSFORMER` in `config.py` to return `None`.
+- [21002](https://github.com/apache/superset/pull/21002): Support Python 3.10 and bump pandas 1.4 and pyarrow 6.

 ### Breaking Changes
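For the 20799 entry above, hiding the tracking URL comes down to overriding the transformer in your own config. A minimal sketch, assuming `TRACKING_URL_TRANSFORMER` keeps the one-argument callable form it has in `config.py`:

```python
# superset_config.py -- a sketch, not the stock configuration; assumes
# TRACKING_URL_TRANSFORMER is a one-argument callable, as in config.py.
from typing import Optional


def TRACKING_URL_TRANSFORMER(url: str) -> Optional[str]:
    # Returning None suppresses the tracking URL in SQL Lab.
    return None
```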
@@ -185,7 +185,7 @@ packaging==21.3
     # via
     #   bleach
     #   deprecation
-pandas==1.3.4
+pandas==1.4.3
     # via apache-superset
 parsedatetime==2.6
     # via apache-superset

@@ -197,7 +197,7 @@ prison==0.2.1
     # via flask-appbuilder
 prompt-toolkit==3.0.28
     # via click-repl
-pyarrow==5.0.0
+pyarrow==6.0.1
     # via apache-superset
 pycparser==2.20
     # via cffi
setup.py — 5 changed lines

@@ -100,7 +100,7 @@ setup(
         "markdown>=3.0",
         "msgpack>=1.0.0, <1.1",
         "numpy==1.22.1",
-        "pandas>=1.3.0, <1.4",
+        "pandas>=1.4.3, <1.5",
         "parsedatetime",
         "pgsanity",
         "polyline",
@@ -108,7 +108,7 @@ setup(
         "python-dateutil",
         "python-dotenv",
         "python-geohash",
-        "pyarrow>=5.0.0, <6.0",
+        "pyarrow>=6.0.1, <7",
         "pyyaml>=5.4",
         "PyJWT>=2.4.0, <3.0",
         "redis",
@@ -183,5 +183,6 @@ setup(
     classifiers=[
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
     ],
 )
@@ -23,7 +23,7 @@ from sqlalchemy import inspect, String, Text
 from superset import db

 from ..utils.database import get_example_database
-from .helpers import get_example_data, get_table_connector_registry
+from .helpers import get_example_url, get_table_connector_registry


 def load_bart_lines(only_metadata: bool = False, force: bool = False) -> None:
@@ -34,8 +34,8 @@ def load_bart_lines(only_metadata: bool = False, force: bool = False) -> None:
     table_exists = database.has_table_by_name(tbl_name)

     if not only_metadata and (not table_exists or force):
-        content = get_example_data("bart-lines.json.gz")
-        df = pd.read_json(content, encoding="latin-1")
+        url = get_example_url("bart-lines.json.gz")
+        df = pd.read_json(url, encoding="latin-1", compression="gzip")
         df["path_json"] = df.path.map(json.dumps)
         df["polyline"] = df.path.map(polyline.encode)
         del df["path"]
@@ -33,7 +33,7 @@ from superset.utils.core import DatasourceType

 from ..utils.database import get_example_database
 from .helpers import (
-    get_example_data,
+    get_example_url,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
@@ -66,7 +66,8 @@ def gen_filter(


 def load_data(tbl_name: str, database: Database, sample: bool = False) -> None:
-    pdf = pd.read_json(get_example_data("birth_names2.json.gz"))
+    url = get_example_url("birth_names2.json.gz")
+    pdf = pd.read_json(url, compression="gzip")
     # TODO(bkyryliuk): move load examples data into the pytest fixture
     if database.backend == "presto":
         pdf.ds = pd.to_datetime(pdf.ds, unit="ms")
@@ -27,7 +27,7 @@ from superset.models.slice import Slice
 from superset.utils.core import DatasourceType

 from .helpers import (
-    get_example_data,
+    get_example_url,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
@@ -44,10 +44,8 @@ def load_country_map_data(only_metadata: bool = False, force: bool = False) -> N
     table_exists = database.has_table_by_name(tbl_name)

     if not only_metadata and (not table_exists or force):
-        csv_bytes = get_example_data(
-            "birth_france_data_for_country_map.csv", is_gzip=False, make_bytes=True
-        )
-        data = pd.read_csv(csv_bytes, encoding="utf-8")
+        url = get_example_url("birth_france_data_for_country_map.csv")
+        data = pd.read_csv(url, encoding="utf-8")
         data["dttm"] = datetime.datetime.now().date()
         data.to_sql(
             tbl_name,
@@ -28,7 +28,7 @@ from superset.models.slice import Slice
 from superset.utils.core import DatasourceType

 from .helpers import (
-    get_example_data,
+    get_example_url,
     get_table_connector_registry,
     merge_slice,
     misc_dash_slices,
@@ -46,8 +46,8 @@ def load_energy(
     table_exists = database.has_table_by_name(tbl_name)

     if not only_metadata and (not table_exists or force):
-        data = get_example_data("energy.json.gz")
-        pdf = pd.read_json(data)
+        url = get_example_url("energy.json.gz")
+        pdf = pd.read_json(url, compression="gzip")
         pdf = pdf.head(100) if sample else pdf
         pdf.to_sql(
             tbl_name,
@@ -20,7 +20,7 @@ from sqlalchemy import DateTime, inspect
 import superset.utils.database as database_utils
 from superset import db

-from .helpers import get_example_data, get_table_connector_registry
+from .helpers import get_example_url, get_table_connector_registry


 def load_flights(only_metadata: bool = False, force: bool = False) -> None:
@@ -32,12 +32,12 @@ def load_flights(only_metadata: bool = False, force: bool = False) -> None:
     table_exists = database.has_table_by_name(tbl_name)

     if not only_metadata and (not table_exists or force):
-        data = get_example_data("flight_data.csv.gz", make_bytes=True)
-        pdf = pd.read_csv(data, encoding="latin-1")
+        flight_data_url = get_example_url("flight_data.csv.gz")
+        pdf = pd.read_csv(flight_data_url, encoding="latin-1", compression="gzip")

         # Loading airports info to join and get lat/long
-        airports_bytes = get_example_data("airports.csv.gz", make_bytes=True)
-        airports = pd.read_csv(airports_bytes, encoding="latin-1")
+        airports_url = get_example_url("airports.csv.gz")
+        airports = pd.read_csv(airports_url, encoding="latin-1", compression="gzip")
         airports = airports.set_index("IATA_CODE")

         pdf[  # pylint: disable=unsupported-assignment-operation,useless-suppression
@@ -17,10 +17,7 @@
 """Loads datasets, dashboards and slices in a new superset instance"""
 import json
 import os
-import zlib
-from io import BytesIO
 from typing import Any, Dict, List, Set
-from urllib import request

 from superset import app, db
 from superset.connectors.sqla.models import SqlaTable
@@ -73,14 +70,5 @@ def get_slice_json(defaults: Dict[Any, Any], **kwargs: Any) -> str:
     return json.dumps(defaults_copy, indent=4, sort_keys=True)


-def get_example_data(
-    filepath: str, is_gzip: bool = True, make_bytes: bool = False
-) -> BytesIO:
-    content = request.urlopen(  # pylint: disable=consider-using-with
-        f"{BASE_URL}{filepath}?raw=true"
-    ).read()
-    if is_gzip:
-        content = zlib.decompress(content, zlib.MAX_WBITS | 16)
-    if make_bytes:
-        content = BytesIO(content)
-    return content
+def get_example_url(filepath: str) -> str:
+    return f"{BASE_URL}{filepath}?raw=true"
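The deleted helper downloaded each example file with `urllib` and gunzipped it with `zlib`; pandas can do both itself when handed a URL, which is what every loader hunk in this commit switches to. A minimal sketch of the new pattern (the URL shape is illustrative of what `get_example_url` returns):

```python
import pandas as pd

# pandas fetches the URL and decompresses it in one call. compression="gzip"
# is passed explicitly because the trailing "?raw=true" hides the .gz
# extension that pandas would otherwise use to infer the compression.
url = "https://example.com/examples-data/bart-lines.json.gz?raw=true"  # illustrative
df = pd.read_json(url, encoding="latin-1", compression="gzip")
print(df.head())
```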
@@ -27,7 +27,7 @@ from superset.models.slice import Slice
 from superset.utils.core import DatasourceType

 from .helpers import (
-    get_example_data,
+    get_example_url,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
@@ -44,8 +44,8 @@ def load_long_lat_data(only_metadata: bool = False, force: bool = False) -> None
     table_exists = database.has_table_by_name(tbl_name)

     if not only_metadata and (not table_exists or force):
-        data = get_example_data("san_francisco.csv.gz", make_bytes=True)
-        pdf = pd.read_csv(data, encoding="utf-8")
+        url = get_example_url("san_francisco.csv.gz")
+        pdf = pd.read_csv(url, encoding="utf-8", compression="gzip")
         start = datetime.datetime.now().replace(
             hour=0, minute=0, second=0, microsecond=0
         )
@@ -25,7 +25,7 @@ from superset.utils.core import DatasourceType

 from ..utils.database import get_example_database
 from .helpers import (
-    get_example_data,
+    get_example_url,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
@@ -44,8 +44,8 @@ def load_multiformat_time_series(  # pylint: disable=too-many-locals
     table_exists = database.has_table_by_name(tbl_name)

     if not only_metadata and (not table_exists or force):
-        data = get_example_data("multiformat_time_series.json.gz")
-        pdf = pd.read_json(data)
+        url = get_example_url("multiformat_time_series.json.gz")
+        pdf = pd.read_json(url, compression="gzip")
         # TODO(bkyryliuk): move load examples data into the pytest fixture
         if database.backend == "presto":
             pdf.ds = pd.to_datetime(pdf.ds, unit="s")
@@ -22,7 +22,7 @@ from sqlalchemy import inspect, String, Text
 import superset.utils.database as database_utils
 from superset import db

-from .helpers import get_example_data, get_table_connector_registry
+from .helpers import get_example_url, get_table_connector_registry


 def load_paris_iris_geojson(only_metadata: bool = False, force: bool = False) -> None:
@@ -33,8 +33,8 @@ def load_paris_iris_geojson(only_metadata: bool = False, force: bool = False) ->
     table_exists = database.has_table_by_name(tbl_name)

     if not only_metadata and (not table_exists or force):
-        data = get_example_data("paris_iris.json.gz")
-        df = pd.read_json(data)
+        url = get_example_url("paris_iris.json.gz")
+        df = pd.read_json(url, compression="gzip")
         df["features"] = df.features.map(json.dumps)

         df.to_sql(
@@ -24,7 +24,7 @@ from superset.models.slice import Slice
 from superset.utils.core import DatasourceType

 from .helpers import (
-    get_example_data,
+    get_example_url,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
@@ -42,8 +42,8 @@ def load_random_time_series_data(
     table_exists = database.has_table_by_name(tbl_name)

     if not only_metadata and (not table_exists or force):
-        data = get_example_data("random_time_series.json.gz")
-        pdf = pd.read_json(data)
+        url = get_example_url("random_time_series.json.gz")
+        pdf = pd.read_json(url, compression="gzip")
         if database.backend == "presto":
             pdf.ds = pd.to_datetime(pdf.ds, unit="s")
             pdf.ds = pdf.ds.dt.strftime("%Y-%m-%d %H:%M:%S")
@@ -22,7 +22,7 @@ from sqlalchemy import BigInteger, Float, inspect, Text
 import superset.utils.database as database_utils
 from superset import db

-from .helpers import get_example_data, get_table_connector_registry
+from .helpers import get_example_url, get_table_connector_registry


 def load_sf_population_polygons(
@@ -35,8 +35,8 @@ def load_sf_population_polygons(
     table_exists = database.has_table_by_name(tbl_name)

     if not only_metadata and (not table_exists or force):
-        data = get_example_data("sf_population.json.gz")
-        df = pd.read_json(data)
+        url = get_example_url("sf_population.json.gz")
+        df = pd.read_json(url, compression="gzip")
         df["contour"] = df.contour.map(json.dumps)

         df.to_sql(
@@ -33,7 +33,7 @@ from superset.utils.core import DatasourceType

 from ..connectors.base.models import BaseDatasource
 from .helpers import (
-    get_example_data,
+    get_example_url,
     get_examples_folder,
     get_slice_json,
     get_table_connector_registry,
@@ -56,8 +56,8 @@ def load_world_bank_health_n_pop(  # pylint: disable=too-many-locals, too-many-s
     table_exists = database.has_table_by_name(tbl_name)

     if not only_metadata and (not table_exists or force):
-        data = get_example_data("countries.json.gz")
-        pdf = pd.read_json(data)
+        url = get_example_url("countries.json.gz")
+        pdf = pd.read_json(url, compression="gzip")
         pdf.columns = [col.replace(".", "_") for col in pdf.columns]
         if database.backend == "presto":
             pdf.year = pd.to_datetime(pdf.year)
@@ -161,6 +161,9 @@ class SupersetResultSet:
         except Exception as ex:  # pylint: disable=broad-except
             logger.exception(ex)

+        if not pa_data:
+            column_names = []
+
         self.table = pa.Table.from_arrays(pa_data, names=column_names)
         self._type_dict: Dict[str, Any] = {}
         try:
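Some context on the guard above: `pa.Table.from_arrays` requires `names` and `arrays` to have the same length, so a result set that produced no column arrays would crash against a non-empty `column_names` list. A sketch of the failure mode under pyarrow 6, with illustrative names:

```python
import pyarrow as pa

arrays: list = []               # no data could be built for any column
column_names = ["id", "value"]  # illustrative; the cursor still reported columns

# pa.Table.from_arrays(arrays, names=column_names) would raise ValueError
# here, since len(arrays) != len(names). Dropping the names when there
# are no arrays yields a valid empty table instead:
if not arrays:
    column_names = []
table = pa.Table.from_arrays(arrays, names=column_names)
print(table.num_columns, table.num_rows)  # 0 0
```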
@@ -49,6 +49,9 @@ def contribution(
     """
     contribution_df = df.copy()
     numeric_df = contribution_df.select_dtypes(include=["number", Decimal])
+    # TODO: copy needed due to following regression in 1.4, remove if not needed:
+    # https://github.com/pandas-dev/pandas/issues/48090
+    numeric_df = numeric_df.copy()
     numeric_df.fillna(0, inplace=True)
     # verify column selections
     if columns:
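The extra `.copy()` guards against pandas-dev/pandas#48090, where `select_dtypes` in pandas 1.4 could return a frame sharing buffers with its source, so the in-place `fillna` would silently write through to the caller's data. A sketch of the hazard, with illustrative data:

```python
import pandas as pd

df = pd.DataFrame({"a": [1.0, None], "b": ["x", "y"]})  # illustrative

numeric_df = df.select_dtypes(include="number")
# Under the 1.4 regression, numeric_df may share memory with df, so
# numeric_df.fillna(0, inplace=True) could also overwrite df["a"].

# Copying first breaks the sharing before mutating:
numeric_df = numeric_df.copy()
numeric_df.fillna(0, inplace=True)
assert df["a"].isna().any()  # the caller's frame is untouched
```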
@@ -2172,14 +2172,14 @@ class FilterBoxViz(BaseViz):
         if df is not None and not df.empty:
             if metric:
                 df = df.sort_values(
-                    utils.get_metric_name(metric), ascending=flt.get("asc")
+                    utils.get_metric_name(metric), ascending=flt.get("asc", False)
                 )
                 data[col] = [
                     {"id": row[0], "text": row[0], "metric": row[1]}
                     for row in df.itertuples(index=False)
                 ]
             else:
-                df = df.sort_values(col, ascending=flt.get("asc"))
+                df = df.sort_values(col, ascending=flt.get("asc", False))
                 data[col] = [
                     {"id": row[0], "text": row[0]}
                     for row in df.itertuples(index=False)
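The `ascending` fix addresses the ValueError quoted in the commit message: `flt.get("asc")` returns `None` when the key is absent, and pandas 1.4 validates that `ascending` is a bool. A sketch with illustrative data:

```python
import pandas as pd

df = pd.DataFrame({"metric": [3, 1, 2]})  # illustrative
flt: dict = {}  # a filter spec without an "asc" key

# pandas 1.4 rejects ascending=None with:
#   ValueError: For argument "ascending" expected type bool,
#   received type NoneType.
# df.sort_values("metric", ascending=flt.get("asc"))

# Defaulting the lookup to False keeps the sort working (descending):
df = df.sort_values("metric", ascending=flt.get("asc", False))
print(df["metric"].tolist())  # [3, 2, 1]
```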