fix: edit pyarrow stringify to better handle emojis and accents (#22881)

This commit is contained in:
Elizabeth Thompson 2023-01-30 12:29:19 -08:00 committed by GitHub
parent c839d0daf5
commit f2b61fca15
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 97 additions and 12 deletions

View File

@ -72,7 +72,12 @@ def stringify_values(array: NDArray[Any]) -> NDArray[Any]:
# pandas <NA> type cannot be converted to string
obj[na_obj] = None # type: ignore
else:
obj[...] = stringify(obj) # type: ignore
try:
# for simple string conversions
# this handles odd character types better
obj[...] = obj.astype(str) # type: ignore
except ValueError:
obj[...] = stringify(obj) # type: ignore
return result

View File

@ -57,10 +57,10 @@ def boxplot(
"""
def quartile1(series: Series) -> float:
return np.nanpercentile(series, 25, interpolation="midpoint") # type: ignore
return np.nanpercentile(series, 25, method="midpoint")
def quartile3(series: Series) -> float:
return np.nanpercentile(series, 75, interpolation="midpoint") # type: ignore
return np.nanpercentile(series, 75, method="midpoint")
if whisker_type == PostProcessingBoxplotWhiskerType.TUKEY:

View File

@ -169,13 +169,13 @@ class TestSupersetResultSet(SupersetTestCase):
"id": 4,
"dict_arr": '[{"table_name": "unicode_test", "database_id": 1}]',
"num_arr": "[1, 2, 3]",
"map_col": '{"chart_name": "scatter"}',
"map_col": "{'chart_name': 'scatter'}",
},
{
"id": 3,
"dict_arr": '[{"table_name": "birth_names", "database_id": 1}]',
"num_arr": "[4, 5, 6]",
"map_col": '{"chart_name": "plot"}',
"map_col": "{'chart_name': 'plot'}",
},
],
)

View File

@ -55,7 +55,87 @@ def test_df_to_records_NaT_type() -> None:
assert df_to_records(df) == [
{"date": None},
{"date": '"2023-01-06T20:50:31.749000+00:00"'},
{"date": "2023-01-06 20:50:31.749000+00:00"},
]
def test_df_to_records_mixed_emoji_type() -> None:
    """
    Check ``df_to_records`` on a string column mixing emoji text with ``NaT``.

    The result set is built from four rows whose ``response`` column holds
    plain text, two strings containing emoji, and one ``NaT``. The records
    are expected to carry the strings through unchanged (no extra quoting)
    and to report ``None`` for the ``NaT`` row.
    """
    from superset.db_engine_specs import BaseEngineSpec
    from superset.result_set import SupersetResultSet

    question = "What's up?"
    responses = [
        "This is a string text",
        "This is a string with an 😍 added",
        NaT,
        "Last emoji 😁",
    ]
    # ``count`` is simply the 1-based position of each row.
    rows = [
        (question, response, count)
        for count, response in enumerate(responses, start=1)
    ]
    description: DbapiDescription = [
        ("question", "varchar", None, None, None, None, False),
        ("response", "varchar", None, None, None, None, False),
        ("count", "integer", None, None, None, None, False),
    ]

    frame = SupersetResultSet(rows, description, BaseEngineSpec).to_pandas_df()

    # Same rows as the input, except that the NaT response becomes None.
    expected_responses = [
        "This is a string text",
        "This is a string with an 😍 added",
        None,
        "Last emoji 😁",
    ]
    expected = [
        {"question": question, "response": response, "count": count}
        for count, response in enumerate(expected_responses, start=1)
    ]
    assert df_to_records(frame) == expected
def test_df_to_records_mixed_accent_type() -> None:
    """
    Check ``df_to_records`` on a string column mixing accented text with ``NaT``.

    The result set is built from four rows whose ``response`` column holds
    plain text, two strings containing accented characters, and one ``NaT``.
    The records are expected to carry the strings through unchanged (no extra
    quoting) and to report ``None`` for the ``NaT`` row.
    """
    from superset.db_engine_specs import BaseEngineSpec
    from superset.result_set import SupersetResultSet

    question = "What's up?"
    responses = [
        "This is a string text",
        "This is a string with áccent",
        NaT,
        "móre áccent",
    ]
    # ``count`` is simply the 1-based position of each row.
    rows = [
        (question, response, count)
        for count, response in enumerate(responses, start=1)
    ]
    description: DbapiDescription = [
        ("question", "varchar", None, None, None, None, False),
        ("response", "varchar", None, None, None, None, False),
        ("count", "integer", None, None, None, None, False),
    ]

    frame = SupersetResultSet(rows, description, BaseEngineSpec).to_pandas_df()

    # Same rows as the input, except that the NaT response becomes None.
    expected_responses = [
        "This is a string text",
        "This is a string with áccent",
        None,
        "móre áccent",
    ]
    expected = [
        {"question": question, "response": response, "count": count}
        for count, response in enumerate(expected_responses, start=1)
    ]
    assert df_to_records(frame) == expected

View File

@ -98,10 +98,10 @@ def test_stringify_with_null_integers():
expected = np.array(
[
array(['"foo"', '"foo"', '"foo"'], dtype=object),
array(['"bar"', '"bar"', '"bar"'], dtype=object),
array(["foo", "foo", "foo"], dtype=object),
array(["bar", "bar", "bar"], dtype=object),
array([None, None, None], dtype=object),
array([None, "true", None], dtype=object),
array([None, "True", None], dtype=object),
]
)
@ -132,10 +132,10 @@ def test_stringify_with_null_timestamps():
expected = np.array(
[
array(['"foo"', '"foo"', '"foo"'], dtype=object),
array(['"bar"', '"bar"', '"bar"'], dtype=object),
array(["foo", "foo", "foo"], dtype=object),
array(["bar", "bar", "bar"], dtype=object),
array([None, None, None], dtype=object),
array([None, "true", None], dtype=object),
array([None, "True", None], dtype=object),
]
)