superset/superset-frontend/plugins/legacy-plugin-chart-country-map/scripts/Country Map GeoJSON Generator.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "65aIalqEt1LR"
   },
   "source": [
    "# Generate GeoJSON from Natural Earth Data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "L4PY3Z15t1LS"
   },
   "source": [
    "## Install Dependencies"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "6_H7qbzIt1LS"
   },
   "source": [
    "```\n",
    "pip install geopandas shapely matplotlib\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "hvA0SEXVt1LS"
   },
   "source": [
    "## Download Data\n",
    "\n",
    "Download datasets (_Admin 0 - Countries_ in [1:10](https://www.naturalearthdata.com/downloads/10m-cultural-vectors/), and _Admin 1 – States, Provinces_ in 1:10 and [1:50](https://www.naturalearthdata.com/downloads/50m-cultural-vectors/)) from Natural Earch Data:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dependencies\n",
    "\n",
    "import os\n",
    "import requests\n",
    "import geopandas as gpd\n",
    "import matplotlib.pyplot as plt\n",
    "import shapely\n",
    "import pandas as pd\n",
    "import shapely.geometry\n",
    "import shapely.ops\n",
    "import shapely.affinity\n",
    "from shapely.geometry import Polygon, MultiPolygon\n",
    "import shutil"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "VjGrqW4Kt1LS",
    "outputId": "2e2accda-5ee4-4270-872e-ecb78d0d02a2"
   },
   "outputs": [],
   "source": [
    "data_dir = os.path.expanduser(\"~/Downloads\")\n",
    "if not os.path.exists(data_dir):\n",
    "    os.mkdir(data_dir)\n",
    "\n",
    "def download_files(skip_existing=True):\n",
    "    for url in [\n",
    "        \"https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/cultural/ne_10m_admin_0_countries.zip\",\n",
    "        \"https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/cultural/ne_10m_admin_1_states_provinces.zip\",\n",
    "        \"https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/50m/cultural/ne_50m_admin_1_states_provinces.zip\"\n",
    "    ]:\n",
    "        file_name = url.split('/')[-1]\n",
    "        full_file_name = f'{data_dir}/{file_name}'\n",
    "        with requests.get(\n",
    "            url,\n",
    "            headers={\n",
    "                \"accept-encoding\": \"gzip, deflate, br\",\n",
    "                \"user-agent\": \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36\"\n",
    "            },\n",
    "            stream=True,\n",
    "        ) as res:\n",
    "            file_size = int(res.headers['content-length'])\n",
    "            if res.status_code != 200:\n",
    "                print(\"Error downloading files. Please open the URL to download them from browser manually.\")\n",
    "                break\n",
    "            if (\n",
    "                skip_existing and\n",
    "                os.path.exists(full_file_name) and\n",
    "                file_size == os.path.getsize(full_file_name)\n",
    "            ):\n",
    "                print(f\"Skip {file_name} because it already exists\")\n",
    "                continue\n",
    "            print(f\"Downloading {file_name}... \\r\", end=\"\")\n",
    "            with open(full_file_name, \"wb\") as fh:\n",
    "                fh.write(res.content)\n",
    "    print(\"Done.                                                               \")\n",
    "\n",
    "download_files(skip_existing=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "EL0e9DEVt1LT",
    "outputId": "16cd6450-d4a3-457a-b205-9797bbce33fc"
   },
   "outputs": [],
   "source": [
    "# Read Natural Earth data files into GeoDataFrames\n",
    "df_admin0_10m = gpd.read_file(f\"{data_dir}/ne_10m_admin_0_countries.zip\")\n",
    "df_10m = gpd.read_file(f\"{data_dir}/ne_10m_admin_1_states_provinces.zip\")\n",
    "df_50m = gpd.read_file(f\"{data_dir}/ne_50m_admin_1_states_provinces.zip\")\n",
    "\n",
    "# Convert column names to lowercase\n",
    "df_admin0_10m.columns = df_admin0_10m.columns.str.lower()\n",
    "\n",
    "# Download and load the GeoJSON file for India\n",
    "india_geojson_url = \"https://github.com/geohacker/india/raw/bcb920c7d3c686f01d085f7661c9ba89bf9bf65e/state/india_state_kashmir_ladakh.geojson\"\n",
    "\n",
    "try:\n",
    "    india_gdf = gpd.read_file(india_geojson_url)\n",
    "    print(\"GeoJSON file for India downloaded and loaded successfully.\")\n",
    "except Exception as e:\n",
    "    print(f\"Unable to download or load the GeoJSON file for India. Error: {str(e)}\")\n",
    "    print(\"Please download the file from the URL and try again.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "DUrz04nYt1LT",
    "outputId": "18d7cdb0-8ab6-4238-e50c-925c5dc117b0"
   },
   "outputs": [],
   "source": [
    "df_50m.groupby('admin').count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 1000
    },
    "id": "eUlJjdRkt1LT",
    "outputId": "60df2dc3-800e-40ac-f151-696a7f91cff4"
   },
   "outputs": [],
   "source": [
    "df_50m[df_50m.adm0_a3 == 'USA'].plot(figsize=(20,10))\n",
    "plt.show()\n",
    "\n",
    "india_gdf.plot(figsize=(20, 10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "pr1jqM3kt1LU",
    "outputId": "7211a182-b64a-469b-fadb-af2148ec6852"
   },
   "outputs": [],
   "source": [
    "# Use 1:50m geometry for some large countries:\n",
    "\n",
    "print(*df_50m['admin'].unique(), sep='\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "k-KuZ8L4t1LU"
   },
   "outputs": [],
   "source": [
    "df = pd.concat([df_10m[~df_10m['admin'].isin(df_50m['admin'].unique())], df_50m])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "06nhCSvTt1LU"
   },
   "source": [
    "## Adjust the Maps\n",
    "\n",
    "<span style=\"color: red; font-size: 1.5em\">TO SUPPORT NEW COUNTRIES, ADD COUNTRY NAME BELOW</span>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "-4uH5XaEt1LU"
   },
   "outputs": [],
   "source": [
    "# Country names used in file names\n",
    "countries = [\n",
    "  'afghanistan',\n",
    "  'albania',\n",
    "  'algeria',\n",
    "  'argentina',\n",
    "  'australia',\n",
    "  'austria',\n",
    "  'belgium',\n",
    "  'bolivia',\n",
    "  'brazil',\n",
    "  'bulgaria',\n",
    "  'burundi',\n",
    "  'canada',\n",
    "  'chile',\n",
    "  'china',\n",
    "  'colombia',\n",
    "  'costa rica',\n",
    "  'cuba',\n",
    "  'cyprus',\n",
    "  'denmark',\n",
    "  'dominican republic',\n",
    "  'ecuador',\n",
    "  'egypt',\n",
    "  'el salvador',\n",
    "  'estonia',\n",
    "  'ethiopia',\n",
    "  'france',\n",
    "  'france_regions', # this is one derived from france - see below\n",
    "  'finland',\n",
    "  'germany',\n",
    "  'guatemala',\n",
    "  'haiti',\n",
    "  'honduras',\n",
    "  'iceland',\n",
    "  'india',\n",
    "  'indonesia',\n",
    "  'iran',\n",
    "  'italy',\n",
    "  'italy_regions', # this one is derived from italy - see below\n",
    "  'japan',\n",
    "  'jordan',\n",
    "  'kazakhstan',\n",
    "  'kenya',\n",
    "  'korea',\n",
    "  'kuwait',\n",
    "  'kyrgyzstan',\n",
    "  'latvia',\n",
    "  'liechtenstein',\n",
    "  'lithuania',\n",
    "  'malaysia',\n",
    "  'mexico',\n",
    "  'morocco',\n",
    "  'myanmar',\n",
    "  'netherlands',\n",
    "  'nicaragua',\n",
    "  'nigeria',\n",
    "  'norway',\n",
    "  'oman',\n",
    "  'pakistan',\n",
    "  'panama',\n",
    "  'papua new guinea',\n",
    "  'paraguay',\n",
    "  'peru',\n",
    "  'philippines',\n",
    "  'portugal',\n",
    "  'poland',\n",
    "  'puerto rico',\n",
    "  'qatar',\n",
    "  'russia',\n",
    "  'rwanda',\n",
    "  'saint barthelemy',\n",
    "  'saint martin',\n",
    "  'saudi arabia',\n",
    "  'singapore',\n",
    "  'slovenia',\n",
    "  'spain',\n",
    "  'sri lanka',\n",
    "  'sweden',\n",
    "  'switzerland',\n",
    "  'syria',\n",
    "  'tajikistan',\n",
    "  'tanzania',\n",
    "  'thailand',\n",
    "  'timorleste',\n",
    "  'turkey',\n",
    "  'turkey_regions', # this one derived from turkey - see below\n",
    "  'turkmenistan',\n",
    "  'uganda',\n",
    "  'uk',\n",
    "  'ukraine',\n",
    "  'united arab emirates',\n",
    "  'uruguay',\n",
    "  'usa',\n",
    "  'uzbekistan',\n",
    "  'venezuela',\n",
    "  'vietnam',\n",
    "  'zambia',\n",
    "]\n",
    "\n",
    "# country name used in dataset\n",
    "country_name_aliases = {\n",
    "    \"uk\": \"united kingdom\",\n",
    "    \"usa\": \"united states of america\",\n",
    "    \"korea\": \"south korea\",\n",
    "    \"timorleste\": \"east timor\",\n",
    "    \"tanzania\": \"united republic of tanzania\",\n",
    "}\n",
    "\n",
    "# CSV files that exist specifically on the repo, rather than in the dataset\n",
    "custom_countries = [\n",
    "    \n",
    "]\n",
    "\n",
    "# Make sure all country names are covered:\n",
    "invalid_countries = [x for x in countries if (country_name_aliases.get(x, x) not in df[\"admin\"].str.lower().unique()) and (x not in custom_countries)]\n",
    "\n",
    "if invalid_countries:\n",
    "  print(f\"Following country names are not valid: {invalid_countries}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "7z--iQz4t1LU"
   },
   "source": [
    "Preview all countries:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "tJ_WNQl8t1LU",
    "outputId": "4f601ce0-26e4-4a40-c36c-8449420e9406"
   },
   "outputs": [],
   "source": [
    "alt_maps = dict()\n",
    "\n",
    "def get_gdf(country):\n",
    "    country_alias = country_name_aliases.get(country, country)\n",
    "    if country in alt_maps:\n",
    "        gdf = alt_maps[country]\n",
    "    elif country in custom_countries:\n",
    "        gdf = gpd.read_file(f'../src/countries_custom/{country}.geojson')\n",
    "    else:\n",
    "        gdf = df[df[\"admin\"].str.lower() == country_alias]\n",
    "    return gdf.copy()\n",
    "\n",
    "def plot_all_countries():\n",
    "    plt.figure(figsize=(20, 20))\n",
    "\n",
    "    for i, country in enumerate(countries):\n",
    "        # create subplot axes in a 3x3 grid\n",
    "        ax = plt.subplot(len(countries) // 5, 6, i + 1) # nrows, ncols, axes position\n",
    "        gdf = get_gdf(country)\n",
    "        if not gdf.empty:  # check if GeoDataFrame is not empty\n",
    "            gdf.plot(ax=ax)\n",
    "            ax.set_aspect('equal', adjustable='datalim')\n",
    "        else:  # if GeoDataFrame is empty\n",
    "            ax.text(0.5, 0.5, country, ha='center', va='center')  # add country name to the center of the subplot\n",
    "        ax.set_title(country)\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "plot_all_countries()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "7Ab0_rHVt1LU"
   },
   "source": [
    "### Handle countries with flying islands\n",
    "\n",
    "- For countries with flying islands, we need to move the islands closer to the mainland."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Z4y46Zuot1LU"
   },
   "source": [
    "#### USA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "xx8IbBKtt1LU",
    "outputId": "025139d2-ba0b-43a9-e2ec-f4608e6ecad2"
   },
   "outputs": [],
   "source": [
    "usa = df[df['adm0_a3'] == 'USA']\n",
    "usa.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "ixC6KENXt1LU",
    "outputId": "8e63cb2d-d733-4a9f-caf6-ccc843f15b5d"
   },
   "outputs": [],
   "source": [
    "def reposition(df, idx, xoff=None, yoff=None, xscale=None, yscale=None, simplify=None):\n",
    "\n",
    "    def move_and_scale(series):\n",
    "        if xoff or yoff:\n",
    "            series = shapely.affinity.translate(series, xoff or 0, yoff or 0)\n",
    "        if xscale or yscale:\n",
    "            series = shapely.affinity.scale(series, xscale or 1, yscale or 1)\n",
    "        if simplify:\n",
    "            series = series.simplify(simplify, preserve_topology=False)\n",
    "        return series\n",
    "\n",
    "    df.loc[idx, 'geometry'] = df.loc[idx, 'geometry'].apply(move_and_scale)\n",
    "\n",
    "\n",
    "usa_copy = usa.copy()\n",
    "reposition(usa_copy, usa.name == 'Hawaii', 51, 5.5)\n",
    "reposition(usa_copy, usa.name == 'Alaska', 35, -34, 0.35, 0.35)\n",
    "\n",
    "usa_copy.plot(figsize=(8,8))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "d1p9cWNxt1LU"
   },
   "source": [
    "#### China\n",
    "\n",
    "China claims sovereign over Taiwan. For disputed territories, we respect each country and give them what they want.\n",
    "\n",
    "In addition, Hong Kong and Macau should also be included in a China map."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "vN4Ngpe7t1LU",
    "outputId": "3bcdc612-cc01-49be-fe19-f6e08e833fca"
   },
   "outputs": [],
   "source": [
    "# Chinese Special Administrative Regions\n",
    "china_sars = df_admin0_10m.loc[\n",
    "    df_admin0_10m.name_en.isin(['Taiwan', 'Hong Kong', 'Macau']),\n",
    "    [x for x in df_admin0_10m.columns if x in df.columns]\n",
    "]\n",
    "china_sars = china_sars.merge(pd.DataFrame(\n",
    "    data={\n",
    "        \"name_en\": [\"Taiwan\", \"Hong Kong\", \"Macau\"],\n",
    "        \"name_zh\": [\"中国台湾\", \"香港特别行政区\", \"澳门特别行政区\"],\n",
    "        \"iso_3166_2\": [\"CN-71\", \"CN-91\", \"CN-92\"],\n",
    "    },\n",
    "), on=\"name_en\", how=\"left\")\n",
    "china_sars"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "PP6E24eEt1LV",
    "outputId": "2621d5f1-1edc-42fc-e8df-8afd6a525cc6",
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "china = df[df.admin == \"China\"]\n",
    "china_copy = pd.concat([china, china_sars], ignore_index=True)\n",
    "\n",
    "# Combine the 'name_zh' columns\n",
    "china_copy[\"name_zh\"] = china_copy[\"name_zh\"].combine_first(china_copy[\"name_zh_y\"])\n",
    "\n",
    "# Drop the extra 'name_zh_x' and 'name_zh_y' columns, if they exist\n",
    "china_copy = china_copy.drop([\"name_zh_x\", \"name_zh_y\"], axis=1)\n",
    "\n",
    "# Plotting the DataFrame\n",
    "china_copy.plot(figsize=(12, 12))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "nqn5qsR-t1LV"
   },
   "source": [
    "Note [ISO-3166-2:CN](https://en.wikipedia.org/wiki/ISO_3166-2:CN) has updated subdivisions to use letters instead of numbers (e.g. `CN-91` -> `CN-HK`). We kept the numeric code for backward compatibility."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "iNnVk5dut1LV"
   },
   "source": [
    "#### Finland\n",
    "\n",
    "- The Åland Islands (ISO country code AX) is an autonomous region of Finland, and carries the ISO-3166 code FI-01."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "LuNGgwiQt1LV"
   },
   "outputs": [],
   "source": [
    "finland_aland = df_admin0_10m.loc[\n",
    "    df_admin0_10m.name_en.isin(['Åland']),\n",
    "    [x for x in df_admin0_10m.columns if x in df.columns]\n",
    "]\n",
    "finland_aland = finland_aland.merge(pd.DataFrame(\n",
    "    data={\n",
    "        \"name_en\": [\"Åland\"],\n",
    "        \"name_fi\": [\"Ahvenanmaan maakunta\"],\n",
    "        \"iso_3166_2\": [\"FI-01\"],\n",
    "    },\n",
    "), on=\"name_en\", how=\"left\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "v8ig_jQDt1LV",
    "outputId": "3f10b14d-dde2-46d9-f4f6-6f4311fb3e73"
   },
   "outputs": [],
   "source": [
    "finland = df[df.admin == \"Finland\"]\n",
    "\n",
    "# Concatenate the 'finland' DataFrame with 'finland_aland' DataFrame\n",
    "finland_copy = pd.concat([finland, finland_aland], ignore_index=True)\n",
    "\n",
    "# Combine 'name_fi' columns. However, since both columns are named 'name_fi', this might be redundant\n",
    "# If you have two different columns for 'name_fi' values in each DataFrame, specify them as 'name_fi_x' and 'name_fi_y'\n",
    "finland_copy[\"name_fi\"] = finland_copy[\"name_fi\"].combine_first(finland_copy[\"name_fi\"])\n",
    "\n",
    "# Drop the 'name_fi' column, if that's intended. This will remove the 'name_fi' data entirely.\n",
    "# If you meant to drop other columns (like 'name_fi_x' and 'name_fi_y'), update the column names accordingly\n",
    "finland_copy = finland_copy.drop([\"name_fi\"], axis=1)\n",
    "\n",
    "# Plotting the DataFrame\n",
    "finland_copy.plot(figsize=(12, 12))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "UP0QB9BZt1LV"
   },
   "source": [
    "#### Norway\n",
    "\n",
    "- Remove NO-X01~ (The uninhabited Bouvet Island) and move Svalbard closer to mainland"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "8zBzSIqQt1LV",
    "outputId": "cc8b6fbf-accb-44ba-b80a-a837df398c96"
   },
   "outputs": [],
   "source": [
    "norway = df[df['adm0_a3'] == 'NOR']\n",
    "norway.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "-LXcKKOjt1LV",
    "outputId": "546a286e-9682-4f9a-c57e-b19250d88a34"
   },
   "outputs": [],
   "source": [
    "norway_copy = norway.copy()\n",
    "\n",
    "norway_copy = norway_copy[norway_copy[\"iso_3166_2\"] != \"NO-X01~\"]\n",
    "reposition(norway_copy, norway.name == 'Svalbard', -12, -8, 0.5, 0.5)\n",
    "#reposition(norway_copy, norway.name == 'Nordland', 10, 0, 2, 2)\n",
    "\n",
    "norway_copy.plot()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "NqdSwt2ct1LV"
   },
   "source": [
    "#### Portugal"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "mznw0XOgt1LV",
    "outputId": "7e8085bc-abd9-4592-f047-62fa1a45eb01"
   },
   "outputs": [],
   "source": [
    "portugal = df[df.admin == 'Portugal']\n",
    "portugal.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "yfAO1qFrt1LV",
    "outputId": "9151ce8f-2412-415b-da73-eeec613276d8"
   },
   "outputs": [],
   "source": [
    "portugal_copy = portugal.copy()\n",
    "\n",
    "reposition(portugal_copy, portugal.name == 'Azores', 11, 0)\n",
    "reposition(portugal_copy, portugal.name == 'Madeira', 6, 2, simplify=0.015)\n",
    "\n",
    "portugal_copy.plot(figsize=(8, 8))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "jJyypJbJt1LV"
   },
   "source": [
    "#### Spain"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "bbyDCO0Qt1LV",
    "outputId": "f2a0594d-999b-4573-d008-5158f898a1c6"
   },
   "outputs": [],
   "source": [
    "spain = df[df.admin == 'Spain']\n",
    "spain.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "yJ_Ueh7Rt1LV",
    "outputId": "16fe59db-4be4-4e02-d37b-3098bdfa945a"
   },
   "outputs": [],
   "source": [
    "spain_copy = spain.copy()\n",
    "\n",
    "reposition(spain_copy, spain.name.isin(['Las Palmas', 'Santa Cruz de Tenerife']), 3, 7, 1, 1)\n",
    "\n",
    "spain_copy.plot(figsize=(8, 8))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "-SNb1b-Et1LV"
   },
   "source": [
    "#### Russia"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "60UpJMNwt1LV",
    "outputId": "1c9ff3fa-83e6-411e-9dc3-0c718ee97d39"
   },
   "outputs": [],
   "source": [
    "russia = df[df.admin == 'Russia']\n",
    "russia.plot()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IOuQ_OzMt1LW"
   },
   "source": [
    "- Russia looks off because of Chukchi runs across E180. We need to move the parts on the other side of the map to the right."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "rfBkQf78t1LW",
    "outputId": "8342e4b8-2483-4aac-8a79-e88d455297e2",
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "def shift_geom(geom, cutoff=0):\n",
    "    border = shapely.geometry.LineString([(cutoff, -90), (cutoff, 90)])\n",
    "    splitted_geom = shapely.ops.split(geom, border)\n",
    "\n",
    "    # Create a list to store moved geometries\n",
    "    moved_geom = []\n",
    "\n",
    "    # Check if the split operation returned a GeometryCollection\n",
    "    if isinstance(splitted_geom, shapely.geometry.GeometryCollection):\n",
    "        # Iterate over each geometry in the GeometryCollection\n",
    "        for item in splitted_geom.geoms:\n",
    "            minx, miny, maxx, maxy = item.bounds\n",
    "            if minx < cutoff:\n",
    "                # Translate the geometry\n",
    "                moved_geom.append(shapely.affinity.translate(item, xoff=360 - cutoff))\n",
    "            else:\n",
    "                moved_geom.append(item)\n",
    "    else:\n",
    "        # If the result is not a GeometryCollection, it means no split occurred\n",
    "        moved_geom.append(geom)\n",
    "\n",
    "    # Combine all moved geometries into a single geometry\n",
    "    return shapely.ops.unary_union(moved_geom)\n",
    "\n",
    "# Applying the function to the DataFrame\n",
    "russia_copy = russia.copy()\n",
    "russia_copy.loc[\n",
    "    russia.name == 'Chukchi Autonomous Okrug', 'geometry'\n",
    "] = russia_copy.loc[\n",
    "    russia.name == 'Chukchi Autonomous Okrug', 'geometry'\n",
    "].apply(shift_geom)\n",
    "\n",
    "# Plotting\n",
    "russia_copy.plot(figsize=(20, 20))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Turkey"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Turkey Regions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "turkey = df[df.admin == 'Turkey'][['iso_3166_2','geometry']]\n",
    "turkey.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NUTS - 1 Codes for Turkey and correspong region - city names\n",
    "\n",
    "region_dict = {\n",
    " 'TR1': ['TR-34'],\n",
    " 'TR2': ['TR-59', 'TR-22', 'TR-39', 'TR-10', 'TR-17'],\n",
    " 'TR3': ['TR-35', 'TR-09', 'TR-20', 'TR-48', 'TR-45', 'TR-03', 'TR-43', 'TR-64'],\n",
    " 'TR4': ['TR-16', 'TR-26', 'TR-11', 'TR-41', 'TR-54', 'TR-81', 'TR-14', 'TR-77'],\n",
    " 'TR5': ['TR-06', 'TR-42', 'TR-70'],\n",
    " 'TR6': ['TR-07', 'TR-32', 'TR-15', 'TR-01', 'TR-33', 'TR-31', 'TR-46', 'TR-80'],\n",
    " 'TR7': ['TR-71', 'TR-68', 'TR-51', 'TR-50', 'TR-40', 'TR-38', 'TR-58', 'TR-66'],\n",
    " 'TR8': ['TR-67', 'TR-78', 'TR-74', 'TR-37', 'TR-18', 'TR-57', 'TR-55', 'TR-60', 'TR-19', 'TR-05'],\n",
    " 'TR9': ['TR-61', 'TR-52', 'TR-28', 'TR-53', 'TR-08', 'TR-29'],\n",
    " 'TRA': ['TR-25', 'TR-24', 'TR-69', 'TR-04', 'TR-36', 'TR-76', 'TR-75'],\n",
    " 'TRB': ['TR-44', 'TR-23', 'TR-12', 'TR-62', 'TR-65', 'TR-49', 'TR-13', 'TR-30'],\n",
    " 'TRC': ['TR-27', 'TR-02', 'TR-79', 'TR-63', 'TR-21', 'TR-47', 'TR-72', 'TR-73', 'TR-56']}\n",
    "\n",
    "# Region names corresponding to NUTS-1\n",
    "\n",
    "region_name_dict = {'TR1':'İstanbul',\n",
    "                    'TR2':'Batı Marmara',\n",
    "                    'TR3':'Ege',\n",
    "                    'TR4':'Doğu Marmara',\n",
    "                    'TR5':'Batı Anadolu',\n",
    "                    'TR6':'Akdeniz',\n",
    "                    'TR7':'Orta Anadolu',\n",
    "                    'TR8':'Batı Karadeniz',\n",
    "                    'TR9':'Doğu Karadeniz',\n",
    "                    'TRA':'Kuzeydoğu Anadolu',\n",
    "                    'TRC':'Güneydoğu Anadolu',\n",
    "                    'TRB':'Ortadoğu Anadolu'\n",
    "                    }\n",
    "\n",
    "\n",
    "def create_region_polygons(region_dict, turkey_gdf):\n",
    "    # Create a reverse dictionary where city codes map to region codes\n",
    "    city_to_region = {city_code: region_code for region_code, city_codes in region_dict.items() for city_code in city_codes}\n",
    "\n",
    "    # Create a new column 'REGION' in the GeoDataFrame that maps each city to its region\n",
    "    turkey_gdf['REGION'] = turkey_gdf['iso_3166_2'].map(city_to_region)\n",
    "\n",
    "    # Dissolve the GeoDataFrame on the 'REGION' column to combine city polygons into region polygons\n",
    "    region_gdf = turkey_gdf.dissolve(by='REGION')\n",
    "\n",
    "    # Reset the index of the new GeoDataFrame\n",
    "    region_gdf.reset_index(inplace=True)\n",
    "    \n",
    "    return region_gdf.drop(columns=['iso_3166_2'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "turkey_regions = create_region_polygons(region_dict, turkey)\n",
    "\n",
    "# Rename 'REGION' column to 'ISO'\n",
    "turkey_regions = turkey_regions.rename(columns={'REGION': 'iso_3166_2'})\n",
    "\n",
    "# Map the region_name_dict to a new 'NAME_1' column\n",
    "turkey_regions['name'] = turkey_regions['iso_3166_2'].map(region_name_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "turkey_regions.plot(figsize=(10, 7), edgecolor='black', column='name', legend=False, cmap='tab20')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "aYFQYe8-t1LW"
   },
   "source": [
    "### France"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "AcT31Diyt1LW",
    "outputId": "cd6cc6ef-43ba-478e-b183-84eb7e003e17"
   },
   "outputs": [],
   "source": [
    "france = df[df.admin == 'France']\n",
    "france.plot()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "p7Y4Vf6pt1LW"
   },
   "source": [
    "Move the [Overseas departments and regions of France](https://en.wikipedia.org/wiki/Overseas_departments_and_regions_of_France) closer to mainland."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Fix some department names\n",
    "\n",
    "- Seien-et-Marne => Seine-et-Marne\n",
    "- Haute-Rhin => Haut-Rhin"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def replace_name(df, old, new):\n",
    "    if old in list(df.name):  \n",
    "        index = df[df.name == old].index[0]\n",
    "        df.at[index, 'name'] = new\n",
    "        \n",
    "replace_name(france, 'Seien-et-Marne', 'Seine-et-Marne')\n",
    "replace_name(france, 'Haute-Rhin', 'Haut-Rhin')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "yjKX9Pbbt1LW",
    "outputId": "14caae01-b1b0-4775-a00e-a9e4f30fdf73"
   },
   "outputs": [],
   "source": [
    "france_copy = france.copy()\n",
    "reposition(france_copy, france.name=='Guadeloupe', 57.4, 25.4, 1.5, 1.5)\n",
    "reposition(france_copy, france.name=='Martinique', 58.4, 27.1, 1.5, 1.5)\n",
    "reposition(france_copy, france.name=='Guyane française', 52, 37.7, 0.35, 0.35)\n",
    "reposition(france_copy, france.name=='La Réunion', -55, 62.8, 1.5, 1.5)\n",
    "reposition(france_copy, france.name=='Mayotte', -43, 54.3, 1.5, 1.5)\n",
    "\n",
    "france_copy.plot(figsize=(8, 8))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### France Regions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "france_regions = france_copy[['geometry','region_cod','region']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "france_regions = france_regions.dissolve(by=['region_cod', 'region']).reset_index()\n",
    "\n",
    "france_regions = france_regions.rename(columns={'region': 'name', 'region_cod': 'iso_3166_2'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "france_regions.plot(figsize=(10, 7), edgecolor='black', column='iso_3166_2', legend=False, cmap='tab20')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Italy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Italy Regions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "italy_regions = df[df.admin == 'Italy'][['geometry','region_cod','region']]\n",
    "\n",
    "italy_regions = italy_regions.dissolve(by=['region_cod', 'region']).reset_index()\n",
    "\n",
    "italy_regions = italy_regions.rename(columns={'region': 'name', 'region_cod': 'iso_3166_2'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "italy_regions.plot(figsize=(10, 7), edgecolor='black', column='iso_3166_2', legend=False, cmap='tab20')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "d1T6jfJPt1LW"
   },
   "source": [
    "#### Netherlands"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def apply_bounds(df, northwest, southeast):\n",
    "    x1, y1 = northwest\n",
    "    x2, y2 = southeast\n",
    "    boundry = shapely.geometry.Polygon([(x1, y1),(x1, y2), (x2, y2), (x2, y1)])\n",
    "    df = df.copy()\n",
    "    return df[df.geometry.apply(lambda x: boundry.contains(x))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "IS5Gcxgct1LW",
    "outputId": "b8dbb05f-4ca9-4884-83ac-a7c169a9830a"
   },
   "outputs": [],
   "source": [
    "netherlands = df[df.admin == 'Netherlands']\n",
    "netherlands.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "nwUGucQ1t1LW",
    "outputId": "26c5aede-c587-4d88-cfe0-30ecaec9ede3"
   },
   "outputs": [],
   "source": [
    "netherlands_copy = apply_bounds(netherlands, (-20, 60), (20, 20))\n",
    "netherlands_copy.plot(figsize=(8, 8))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "tTpJe28jt1LW"
   },
   "source": [
    "#### UK"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "xfMx6gJmt1LW",
    "outputId": "5278dfc3-3f51-4c21-84cc-922251b1d0cb"
   },
   "outputs": [],
   "source": [
    "uk = df[df.admin == 'United Kingdom']\n",
    "uk.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "28VU40f9t1LW",
    "outputId": "45585067-de13-4e02-8147-053ef0115d2d"
   },
   "outputs": [],
   "source": [
    "uk_copy = apply_bounds(uk, (-10, 60), (20, 20))\n",
    "uk_copy.plot(figsize=(8, 8))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Fb58eGlIt1LW"
   },
   "source": [
    "## Output GeoJSON"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "5xOVyzXCt1LW"
   },
   "outputs": [],
   "source": [
    "alt_maps = {\n",
    "    \"finland\": finland_copy,\n",
    "    \"china\": china_copy,\n",
    "    \"usa\": usa_copy,\n",
    "    \"france\": france_copy,\n",
    "    \"france_regions\": france_regions,\n",
    "    \"turkey_regions\": turkey_regions,\n",
    "    \"italy_regions\": italy_regions,\n",
    "    \"netherlands\": netherlands_copy,\n",
    "    \"norway\": norway_copy,\n",
    "    \"uk\": uk_copy,\n",
    "    \"russia\": russia_copy,\n",
    "    \"spain\": spain_copy,\n",
    "    \"portugal\": portugal_copy,\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "tM1F5d0Vt1LW",
    "outputId": "75abad9b-9442-4279-d66d-a0cd5fb97198"
   },
   "outputs": [],
   "source": [
    "plot_all_countries()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "8U3S1PUbt1LW",
    "outputId": "cfb8d229-ffdf-473f-d516-6aa136e41a60",
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "simplify_factors = {\n",
    "    \"uk\": 0.005,\n",
    "}\n",
    "useful_columns = [\"ISO\", \"NAME_1\", \"geometry\"]\n",
    "\n",
    "def get_simplify_factor_by_size(gdf):\n",
    "    xmin, ymin, xmax, ymax = shapely.ops.unary_union(gdf[\"geometry\"]).bounds\n",
    "    size = (xmax - xmin) * (ymax - ymin)\n",
    "    print(\"Size\", round(size, 3), end=\"\\t\")\n",
    "    if size > 1000: return 0.03\n",
    "    if size > 300: return 0.02\n",
    "    if size > 100: return 0.01\n",
    "    return 0\n",
    "\n",
    "def simplify_if_needed(country, gdf):\n",
    "    \"\"\"Simplify the maps based on country size\"\"\"\n",
    "    country_alias = country_name_aliases.get(country, country)\n",
    "    if country_alias in df_50m[\"admin\"].str.lower().unique():\n",
    "        return\n",
    "\n",
    "    factor = simplify_factors.get(country) or get_simplify_factor_by_size(gdf)\n",
    "\n",
    "    if factor:\n",
    "        gdf[\"geometry\"] = gdf.simplify(factor)\n",
    "\n",
    "def save_geojson(country):\n",
    "    if country in custom_countries:\n",
    "        shutil.copy(f\"../src/countries_custom/{country}.geojson\", f\"../src/countries/{country}.geojson\")\n",
    "    else:\n",
    "        gdf = get_gdf(country)\n",
    "        print(country, end=\"\\t\")\n",
    "    \n",
    "        # For backward compatibility\n",
    "        gdf[\"ISO\"] = gdf[\"iso_3166_2\"]\n",
    "        gdf[\"NAME_1\"] = gdf[\"name\"]\n",
    "    \n",
    "        simplify_if_needed(country, gdf)\n",
    "    \n",
    "        print(f'Saving geojson for {country}...')\n",
    "        filename_country = country.replace(' ', '_')\n",
    "        gdf[useful_columns].to_file(f\"../src/countries/{filename_country}.geojson\", driver=\"GeoJSON\")\n",
    "\n",
    "for country in countries:\n",
    "    save_geojson(country)\n",
    "\n",
    "print(\"Done.                          \")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Fb58eGlIt1LW"
   },
   "source": [
    "## Output Typescript"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function to convert country name to a valid JavaScript identifier\n",
    "def to_js_identifier(name):\n",
    "    return name.replace(' ', '_').replace('-', '_')\n",
    "\n",
    "# License boilerplate\n",
    "license_boilerplate = \"\"\"/*\n",
    " * Licensed to the Apache Software Foundation (ASF) under one\n",
    " * or more contributor license agreements.  See the NOTICE file\n",
    " * distributed with this work for additional information\n",
    " * regarding copyright ownership.  The ASF licenses this file\n",
    " * to you under the Apache License, Version 2.0 (the\n",
    " * \"License\"); you may not use this file except in compliance\n",
    " * with the License.  You may obtain a copy of the License at\n",
    " *\n",
    " *   http://www.apache.org/licenses/LICENSE-2.0\n",
    " *\n",
    " * Unless required by applicable law or agreed to in writing,\n",
    " * software distributed under the License is distributed on an\n",
    " * \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n",
    " * KIND, either express or implied.  See the License for the\n",
    " * specific language governing permissions and limitations\n",
    " * under the License.\n",
    " */\n",
    "\"\"\"\n",
    "\n",
    "# Generate TypeScript import statements\n",
    "imports = \"\\n\".join([f\"import {to_js_identifier(country)} from './countries/{to_js_identifier(country)}.geojson';\" for country in countries])\n",
    "\n",
    "# Generate the export object\n",
    "exports = \"export const countries = {\\n  \" + \",\\n  \".join([to_js_identifier(country) for country in countries]) + \",\\n};\"\n",
    "\n",
    "# Additional exports\n",
    "additional_exports = \"\"\"\n",
    "export const countryOptions = Object.keys(countries).map(x => {\n",
    "  if (x === 'uk' || x === 'usa') {\n",
    "    return [x, x.toUpperCase()];\n",
    "  }\n",
    "  if (x === 'italy_regions') {\n",
    "    return [x, 'Italy (regions)'];\n",
    "  }\n",
    "  if (x === 'france_regions') {\n",
    "    return [x, 'France (regions)'];\n",
    "  }\n",
    "  if (x === 'turkey_regions') {\n",
    "    return [x, 'Turkey (regions)'];\n",
    "  }\n",
    "  return [\n",
    "    x,\n",
    "    x\n",
    "      .split('_')\n",
    "      .map(e => e[0].toUpperCase() + e.slice(1))\n",
    "      .join(' '),\n",
    "  ];\n",
    "});\n",
    "\n",
    "export default countries;\n",
    "\"\"\"\n",
    "\n",
    "# Combine license, imports, exports, and additional exports\n",
    "typescript_code = f\"{license_boilerplate}\\n{imports}\\n\\n{exports}\\n{additional_exports}\"\n",
    "\n",
    "# Write to a file\n",
    "with open(\"../src/countries.ts\", \"w\") as file:\n",
    "    file.write(typescript_code)\n",
    "\n",
    "print(\"TypeScript code written to src/countries.ts\")"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  },
  "vscode": {
   "interpreter": {
    "hash": "bd385fe162c5ca0c84973b7dd5c518456272446b2b64e67c2a69f949ca7a1754"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}