Skip to content

Commit

Permalink
♻️ deal again with pycountry
Browse files Browse the repository at this point in the history
Track Turkey in a custom pycountry instance and use that custom instance when searching for countries
  • Loading branch information
bunop committed May 23, 2024
1 parent 95138b1 commit c96349e
Show file tree
Hide file tree
Showing 7 changed files with 53 additions and 37 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@
"outputs": [],
"source": [
"import pandas as pd\n",
"import pycountry\n",
"\n",
"from geopy.geocoders import Nominatim\n",
"from geopy.extra.rate_limiter import RateLimiter\n",
"from tqdm.notebook import tqdm\n",
"\n",
"from src.features.utils import countries\n",
"from src.features.smarterdb import global_connection, Dataset"
]
},
Expand Down Expand Up @@ -216,7 +216,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d8df7ad85ccf44cd99f825483c245887",
"model_id": "c2cdae511e6b4e5dabf5f3a432398697",
"version_major": 2,
"version_minor": 0
},
Expand All @@ -235,7 +235,7 @@
" data = rgeocode(coordinate, language=\"English\")\n",
" if data:\n",
" country_code = data.raw['address']['country_code']\n",
" return pycountry.countries.get(alpha_2=country_code).name\n",
" return countries.get(alpha_2=country_code).name\n",
" else:\n",
" return data\n",
"infos[\"country\"] = infos[\"coordinates\"].progress_apply(get_country)"
Expand Down Expand Up @@ -569,7 +569,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
Expand All @@ -583,7 +583,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
"version": "3.12.3"
}
},
"nbformat": 4,
Expand Down
56 changes: 33 additions & 23 deletions notebooks/exploratory/0.22.0-bunop-new_sheep_background_data.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import pycountry\n",
"from tqdm.notebook import tqdm\n",
"from geopy.geocoders import Nominatim\n",
"from geopy.extra.rate_limiter import RateLimiter\n",
Expand All @@ -37,6 +36,7 @@
"\n",
"from src.features.smarterdb import global_connection, Dataset, SampleSheep\n",
"from src.features.plinkio import TextPlinkIO, BinaryPlinkIO, CodingException\n",
"from src.features.utils import countries\n",
"from src.data.common import WORKING_ASSEMBLIES, AssemblyConf"
]
},
Expand Down Expand Up @@ -220,7 +220,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d8c53a08951644bc9dbbaa2e99016978",
"model_id": "9dc3f0261f4f4e28979cc0069b65cc8f",
"version_major": 2,
"version_minor": 0
},
Expand Down Expand Up @@ -264,7 +264,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Got {'Llanwenog', 'Balwen', 'TregaronWelshMountain', 'Llawenog', 'TalybontWelshMountain', 'SouthWalesWelshMountain', 'Lleyn', 'LlandoveryWhiteFaced', 'DolgellauWelshMountain', 'ClunForest', 'WelshMountainHillFlock', 'BrecknockHillCheviot', 'HillRadnor', 'BadgerFaced', 'ImprovedWelshMountain', 'Beulah', 'KerryHill', 'HardySpeckledFaced', 'BlackWelshMountain'} breeds\n"
"Got {'Llanwenog', 'Beulah', 'Balwen', 'Lleyn', 'BadgerFaced', 'TregaronWelshMountain', 'WelshMountainHillFlock', 'ImprovedWelshMountain', 'KerryHill', 'ClunForest', 'HardySpeckledFaced', 'DolgellauWelshMountain', 'SouthWalesWelshMountain', 'LlandoveryWhiteFaced', 'TalybontWelshMountain', 'Llawenog', 'BlackWelshMountain', 'BrecknockHillCheviot', 'HillRadnor'} breeds\n"
]
}
],
Expand Down Expand Up @@ -310,7 +310,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{'Llanwenog': 'Llanwenog', 'Balwen': 'Balwen', 'TregaronWelshMountain': 'Tregaron Welsh Mountain', 'Llawenog': 'Llawenog', 'TalybontWelshMountain': 'Talybont Welsh Mountain', 'SouthWalesWelshMountain': 'South Wales Welsh Mountain', 'Lleyn': 'Lleyn', 'LlandoveryWhiteFaced': 'Llandovery White Faced', 'DolgellauWelshMountain': 'Dolgellau Welsh Mountain', 'ClunForest': 'Clun Forest', 'WelshMountainHillFlock': 'Welsh Mountain Hill Flock', 'BrecknockHillCheviot': 'Brecknock Hill Cheviot', 'HillRadnor': 'Hill Radnor', 'BadgerFaced': 'Badger Faced', 'ImprovedWelshMountain': 'Improved Welsh Mountain', 'Beulah': 'Beulah', 'KerryHill': 'Kerry Hill', 'HardySpeckledFaced': 'Hardy Speckled Faced', 'BlackWelshMountain': 'Black Welsh Mountain'}\n"
"{'Llanwenog': 'Llanwenog', 'Beulah': 'Beulah', 'Balwen': 'Balwen', 'Lleyn': 'Lleyn', 'BadgerFaced': 'Badger Faced', 'TregaronWelshMountain': 'Tregaron Welsh Mountain', 'WelshMountainHillFlock': 'Welsh Mountain Hill Flock', 'ImprovedWelshMountain': 'Improved Welsh Mountain', 'KerryHill': 'Kerry Hill', 'ClunForest': 'Clun Forest', 'HardySpeckledFaced': 'Hardy Speckled Faced', 'DolgellauWelshMountain': 'Dolgellau Welsh Mountain', 'SouthWalesWelshMountain': 'South Wales Welsh Mountain', 'LlandoveryWhiteFaced': 'Llandovery White Faced', 'TalybontWelshMountain': 'Talybont Welsh Mountain', 'Llawenog': 'Llawenog', 'BlackWelshMountain': 'Black Welsh Mountain', 'BrecknockHillCheviot': 'Brecknock Hill Cheviot', 'HillRadnor': 'Hill Radnor'}\n"
]
}
],
Expand Down Expand Up @@ -469,7 +469,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c47e48b6f4d947708be1fdbad12e34f0",
"model_id": "7b3512fc33c84582bf02223c7988bd43",
"version_major": 2,
"version_minor": 0
},
Expand Down Expand Up @@ -528,7 +528,7 @@
" 7 Ho 8 non-null float64\n",
" 8 Ho (SD) 8 non-null float64\n",
"dtypes: float64(4), int64(1), object(4)\n",
"memory usage: 704.0+ bytes\n"
"memory usage: 708.0+ bytes\n"
]
}
],
Expand Down Expand Up @@ -1668,7 +1668,7 @@
" 7 Source 16 non-null object \n",
" 8 Ho 16 non-null float64\n",
"dtypes: float64(3), int64(2), object(4)\n",
"memory usage: 1.2+ KB\n"
"memory usage: 1.3+ KB\n"
]
}
],
Expand Down Expand Up @@ -2498,7 +2498,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c77eb62bceae401083f1e2b3e062d264",
"model_id": "ad74d746c63a4d829b674d9994b94477",
"version_major": 2,
"version_minor": 0
},
Expand Down Expand Up @@ -2900,7 +2900,7 @@
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 910 entries, 0 to 909\n",
"RangeIndex: 910 entries, 0 to 909\n",
"Data columns (total 20 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
Expand All @@ -2925,7 +2925,7 @@
" 18 country_old 910 non-null object \n",
" 19 region_old 856 non-null object \n",
"dtypes: float64(7), int64(1), object(12)\n",
"memory usage: 149.3+ KB\n"
"memory usage: 142.3+ KB\n"
]
}
],
Expand Down Expand Up @@ -3099,18 +3099,19 @@
{
"data": {
"text/plain": [
"breed\n",
"Croatian Isles 90\n",
"Gentile di Puglia 24\n",
"Bergamasca 24\n",
"Massese 24\n",
"Norwegian White 24\n",
"Xisqueta 24\n",
"Sardinian mouflon 24\n",
"Zel 24\n",
" ..\n",
"Dalmatian 4\n",
"Schoonebeker 4\n",
"Sumavska 4\n",
"Recka 2\n",
"Dubska 2\n",
"Privorska 2\n",
"Name: breed, Length: 61, dtype: int64"
"Dubska 2\n",
"Name: count, Length: 61, dtype: int64"
]
},
"execution_count": 49,
Expand Down Expand Up @@ -3139,10 +3140,11 @@
{
"data": {
"text/plain": [
"breed\n",
"Sardinian mouflon 24\n",
"European mouflon 21\n",
"Iranian mouflon 14\n",
"Name: breed, dtype: int64"
"Name: count, dtype: int64"
]
},
"execution_count": 50,
Expand Down Expand Up @@ -3610,7 +3612,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "0bbde783cce047268ce5a825f40390a7",
"model_id": "5be71ec1425a4826a26d5c0cf32d3c02",
"version_major": 2,
"version_minor": 0
},
Expand Down Expand Up @@ -4090,7 +4092,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ebac2f58b31f4a678a1b56dfb6bf3e73",
"model_id": "93f177b8cbdb461487a74860ece64daa",
"version_major": 2,
"version_minor": 0
},
Expand Down Expand Up @@ -4258,7 +4260,7 @@
" data = rgeocode(coordinate, language=\"English\")\n",
" if data:\n",
" country_code = data.raw['address']['country_code']\n",
" return pycountry.countries.get(alpha_2=country_code).name\n",
" return countries.get(alpha_2=country_code).name\n",
" else:\n",
" return data\n",
"\n",
Expand Down Expand Up @@ -4788,7 +4790,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "46f9b1f0b5e541f3b320a0fba510b53f",
"model_id": "5bbd4402a08b4a56a8565e309549eb39",
"version_major": 2,
"version_minor": 0
},
Expand Down Expand Up @@ -4966,6 +4968,14 @@
"id": "2b340aaf-5749-4f2e-ba54-348c185e5313",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_165225/3722076925.py:1: FutureWarning: DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.\n",
" gaouar_2017_sheeps.fillna(method='ffill', axis=0, inplace=True)\n"
]
},
{
"data": {
"text/html": [
Expand Down Expand Up @@ -6546,7 +6556,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
Expand All @@ -6560,7 +6570,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
"version": "3.12.3"
}
},
"nbformat": 4,
Expand Down
4 changes: 2 additions & 2 deletions notebooks/results/0.2.0-bunop-smarter_summary.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,10 @@
"import matplotlib.pyplot as plt\n",
"from shapely.geometry import Point\n",
"import geopandas as gpd\n",
"import pycountry\n",
"from mpl_toolkits.axes_grid1 import make_axes_locatable\n",
"\n",
"from src.features.smarterdb import global_connection, Dataset, SampleSheep, SampleGoat, Country\n",
"from src.features.utils import countries\n",
"\n",
"conn = global_connection()\n",
"\n",
Expand Down Expand Up @@ -529,7 +529,7 @@
"def fix_iso_a3(name, iso_a3):\n",
" if iso_a3 == '-99':\n",
" try:\n",
" return pycountry.countries.search_fuzzy(name.split()[-1])[0].alpha_3\n",
" return countries.search_fuzzy(name.split()[-1])[0].alpha_3\n",
" except LookupError:\n",
" return \"-99\"\n",
" else:\n",
Expand Down
2 changes: 2 additions & 0 deletions src/data/import_breeds.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ def main(species_class, src_dataset, dst_dataset, datafile, code_column,
# dealing with original file
alias = BreedAlias(fid=fid, dataset=dst_dataset, country=country)

logger.debug(f"Got alias: {alias}")

try:
breed, modified = get_or_create_breed(
species_class=species_class.capitalize(),
Expand Down
5 changes: 2 additions & 3 deletions src/data/import_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
MutuallyExclusiveOptionGroup)
from mongoengine.errors import DoesNotExist

import pycountry
from pandas.core.series import Series

from src.data.common import (
Expand All @@ -31,7 +30,7 @@
from src.features.smarterdb import (
global_connection, Breed, get_or_create_sample, get_sample_type,
SmarterDBException)
from src.features.utils import UnknownCountry
from src.features.utils import UnknownCountry, countries

logger = logging.getLogger(__name__)

Expand All @@ -53,7 +52,7 @@ def find_country(country: str):
return UnknownCountry()

# transform country string with pycountry
fuzzy = pycountry.countries.search_fuzzy(country)[0]
fuzzy = countries.search_fuzzy(country)[0]

logger.info(f"Found {fuzzy} for {country}")

Expand Down
7 changes: 3 additions & 4 deletions src/features/smarterdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import os
import logging
import pathlib
import pycountry
import mongoengine

from enum import Enum
Expand All @@ -18,7 +17,7 @@
from pymongo import database, ReturnDocument, MongoClient
from dotenv import find_dotenv, load_dotenv

from .utils import get_project_dir, UnknownCountry
from .utils import get_project_dir, UnknownCountry, countries

SPECIES2CODE = {
"Sheep": "OA",
Expand Down Expand Up @@ -192,7 +191,7 @@ def __init__(self, name: str = None, *args, **kwargs):
if name.lower() == "unknown":
country = UnknownCountry()
else:
country = pycountry.countries.get(name=name)
country = countries.get(name=name)

if country:
self.alpha_2 = country.alpha_2
Expand Down Expand Up @@ -530,7 +529,7 @@ def getSmarterId(
if country.lower() == "unknown":
country = UnknownCountry()
else:
country = pycountry.countries.get(name=country)
country = countries.get(name=country)

# get two letter code for country
country_code = country.alpha_2
Expand Down
6 changes: 6 additions & 0 deletions src/features/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,15 @@
import pathlib
import collections

from pycountry import countries

# Get an instance of a logger
logger = logging.getLogger(__name__)

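# Manage custom countries: entries are registered on the `countries` object
# imported from pycountry above, which other modules import from here.
# Add an entry with the plain English name "Turkey".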
countries.add_entry(alpha_2="TR", alpha_3="TUR", name="Turkey", numeric="792", official_name='Republic of Türkiye')


def sanitize(
word: str,
Expand Down

0 comments on commit c96349e

Please sign in to comment.