From d652f55e70c93d0b6939d20f1e955b2b4f422a2c Mon Sep 17 00:00:00 2001 From: Paolo Cozzi Date: Thu, 23 May 2024 18:12:59 +0200 Subject: [PATCH] :bug: fix country for french dataset using reverse geocoding to determine country of origin of french sheeps --- HISTORY.rst | 1 + Makefile | 2 +- .../0.8.0-bunop-describe-French-sheep.ipynb | 297 ++++++++++++------ 3 files changed, 196 insertions(+), 104 deletions(-) diff --git a/HISTORY.rst b/HISTORY.rst index a034ba25..f2058543 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -16,6 +16,7 @@ TODO 0.4.10.dev0 ----------- +* Fix *french sheep* dataset country of origin using reverse geocoding (`#112 `__) * Manage python packages with `poetry `__ (`#128 `__) * Add data for Guisandesa goats (`#117 `) * Rename ``manifacturer`` into ``manufacturer`` diff --git a/Makefile b/Makefile index 391efc30..373564cf 100644 --- a/Makefile +++ b/Makefile @@ -147,7 +147,7 @@ data: requirements ## load breeds into database relying on dataset $(PYTHON_INTERPRETER) src/data/import_breeds.py --species_class Sheep --src_dataset="High density genotypes of French Sheep populations.zip" \ - --datafile Populations_infos_fix.xlsx --code_column Code --breed_column "Population Name" + --datafile Populations_infos_fix.xlsx --code_column Code --breed_column "Population Name" --country_column Country $(PYTHON_INTERPRETER) src/data/import_breeds.py --species_class Sheep --src_dataset=ovine_SNP50HapMap_data.zip \ --datafile ovine_SNP50HapMap_data/kijas2012_dataset_fix.xlsx --code_column code --breed_column Breed \ --fid_column Breed --country_column country diff --git a/notebooks/exploratory/0.8.0-bunop-describe-French-sheep.ipynb b/notebooks/exploratory/0.8.0-bunop-describe-French-sheep.ipynb index c93a2ec7..9d7eeae9 100644 --- a/notebooks/exploratory/0.8.0-bunop-describe-French-sheep.ipynb +++ b/notebooks/exploratory/0.8.0-bunop-describe-French-sheep.ipynb @@ -16,16 +16,18 @@ "metadata": {}, "outputs": [], "source": [ - "import io\n", "import csv\n", "import itertools\n", "\n", "from collections import Counter\n", - "from pathlib import Path\n", "\n", "import pandas as pd\n", "from plinkio import plinkfile\n", + "from geopy.geocoders import Nominatim\n", + "from geopy.extra.rate_limiter import RateLimiter\n", + "from tqdm.notebook import tqdm\n", "\n", + "from src.features.utils import countries\n", "from src.features.smarterdb import VariantSheep, global_connection, Dataset" ] }, @@ -45,7 +47,8 @@ "outputs": [], "source": [ "global_connection()\n", - "dataset = Dataset.objects.filter(file=\"High density genotypes of French Sheep populations.zip\").get()" + "dataset = Dataset.objects.filter(file=\"High density genotypes of French Sheep populations.zip\").get()\n", + "tqdm.pandas()" ] }, { @@ -66,11 +69,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "/home/paolo/Projects/SMARTER-database/data/interim/604f75a61a08c53cebd09b58/frenchsheep_HD.bed\n", + "/home/paolo/Projects/SMARTER-database/data/interim/604f75a61a08c53cebd09b58/Populations_infos.xlsx\n", "/home/paolo/Projects/SMARTER-database/data/interim/604f75a61a08c53cebd09b58/info.txt\n", + "/home/paolo/Projects/SMARTER-database/data/interim/604f75a61a08c53cebd09b58/OAR3\n", "/home/paolo/Projects/SMARTER-database/data/interim/604f75a61a08c53cebd09b58/frenchsheep_HD.bim\n", - "/home/paolo/Projects/SMARTER-database/data/interim/604f75a61a08c53cebd09b58/frenchsheep_HD.fam\n", - "/home/paolo/Projects/SMARTER-database/data/interim/604f75a61a08c53cebd09b58/Populations_infos.xlsx\n" + "/home/paolo/Projects/SMARTER-database/data/interim/604f75a61a08c53cebd09b58/Populations_infos_fix.xlsx\n", + "/home/paolo/Projects/SMARTER-database/data/interim/604f75a61a08c53cebd09b58/frenchsheep_HD.bed\n", + "/home/paolo/Projects/SMARTER-database/data/interim/604f75a61a08c53cebd09b58/OAR4\n", + "/home/paolo/Projects/SMARTER-database/data/interim/604f75a61a08c53cebd09b58/frenchsheep_HD.fam\n" ] } ], @@ -141,7 +147,6 @@ " Color\n", " POP_GROUP_CODE\n", " POP_GROUP_NAME\n", - " Unnamed: 8\n", " \n", " \n", " \n", @@ -155,7 +160,6 @@ " NaN\n", " NORTH\n", " NORTH\n", - " NaN\n", " \n", " \n", " 1\n", @@ -167,7 +171,6 @@ " NaN\n", " SOUTH\n", " SOUTH\n", - " NaN\n", " \n", " \n", " 2\n", @@ -179,7 +182,6 @@ " NaN\n", " SOUTH\n", " SOUTH\n", - " NaN\n", " \n", " \n", " 3\n", @@ -191,7 +193,6 @@ " NaN\n", " NORTH\n", " NORTH\n", - " NaN\n", " \n", " \n", " 4\n", @@ -203,7 +204,6 @@ " NaN\n", " NORTH\n", " NORTH\n", - " NaN\n", " \n", " \n", "\n", @@ -224,12 +224,12 @@ "3 http://en.france-genetique-elevage.org/Charoll... 46.435442 4.277004 \n", "4 http://en.france-genetique-elevage.org/Charmoi... 47.390249 1.254324 \n", "\n", - " Color POP_GROUP_CODE POP_GROUP_NAME Unnamed: 8 \n", - "0 NaN NORTH NORTH NaN \n", - "1 NaN SOUTH SOUTH NaN \n", - "2 NaN SOUTH SOUTH NaN \n", - "3 NaN NORTH NORTH NaN \n", - "4 NaN NORTH NORTH NaN " + " Color POP_GROUP_CODE POP_GROUP_NAME \n", + "0 NaN NORTH NORTH \n", + "1 NaN SOUTH SOUTH \n", + "2 NaN SOUTH SOUTH \n", + "3 NaN NORTH NORTH \n", + "4 NaN NORTH NORTH " ] }, "execution_count": 5, @@ -277,8 +277,7 @@ " counter.update([variant.name])\n", " location = next(filter(lambda loc: loc.imported_from == \"SNPchiMp v.3\", variant.locations))\n", " if line[0] != location.chrom or int(line[3]) != location.position:\n", - " print(f\"snp {line[1]} with different positions: {line[0]}:{line[3]}<>{location.chrom}:{location.position}\")\n", - " " + " print(f\"snp {line[1]} with different positions: {line[0]}:{line[3]}<>{location.chrom}:{location.position}\")\n" ] }, { @@ -369,138 +368,138 @@ " \n", " \n", " 0\n", - " CDL\n", - " Causses du Lot\n", + " IDF\n", + " Île de France\n", " \n", " \n", " 1\n", - " TEX\n", - " Texel\n", + " ROM\n", + " Romanov\n", " \n", " \n", " 2\n", - " SUF\n", - " Suffolk\n", + " CDL\n", + " Causses du Lot\n", " \n", " \n", " 3\n", - " CHR\n", - " Charmoise\n", + " SUF\n", + " Suffolk\n", " \n", " \n", " 4\n", - " ROM\n", - " Romanov\n", + " LAC\n", + " Lacaune (milk)\n", " \n", " \n", " 5\n", - " COR\n", - " Corse\n", + " MER\n", + " Mérinos d'Arles\n", " \n", " \n", " 6\n", - " RWE\n", - " Rouge de l'Ouest\n", + " PAS\n", + " Préalpes du Sud\n", " \n", " \n", " 7\n", - " TAR\n", - " Tarasconnaise\n", + " MTR\n", + " Manech Tête Rouge\n", " \n", " \n", " 8\n", - " LAC\n", - " Lacaune (milk)\n", + " LAM\n", + " Lacaune (meat)\n", " \n", " \n", " 9\n", - " RAV\n", - " Rava\n", + " OUE\n", + " Ouessant\n", " \n", " \n", " 10\n", - " LAM\n", - " Lacaune (meat)\n", + " ROU\n", + " Roussin de la Hague\n", " \n", " \n", " 11\n", - " RAM\n", - " Mérinos de Rambouillet\n", + " LIM\n", + " Limousine\n", " \n", " \n", " 12\n", - " BER\n", - " Berrichon du Cher\n", + " CHA\n", + " Mouton Charollais\n", " \n", " \n", " 13\n", - " OUE\n", - " Ouessant\n", + " RMN\n", + " Romane\n", " \n", " \n", " 14\n", - " MOU\n", - " Mourerous\n", + " TAR\n", + " Tarasconnaise\n", " \n", " \n", " 15\n", - " BMC\n", - " Blanc du Massif Central\n", + " EUR\n", + " European Mouflon\n", " \n", " \n", " 16\n", - " LIM\n", - " Limousine\n", + " NVE\n", + " Noire du Velay\n", " \n", " \n", " 17\n", - " EUR\n", - " European Mouflon\n", + " MOU\n", + " Mourerous\n", " \n", " \n", " 18\n", - " NVE\n", - " Noire du Velay\n", + " COR\n", + " Corse\n", " \n", " \n", " 19\n", - " MER\n", - " Mérinos d'Arles\n", + " TEX\n", + " Texel\n", " \n", " \n", " 20\n", - " CHA\n", - " Mouton Charollais\n", + " RAM\n", + " Mérinos de Rambouillet\n", " \n", " \n", " 21\n", - " ROU\n", - " Roussin de la Hague\n", + " RAV\n", + " Rava\n", " \n", " \n", " 22\n", - " IDF\n", - " Île de France\n", + " RWE\n", + " Rouge de l'Ouest\n", " \n", " \n", " 23\n", - " PAS\n", - " Préalpes du Sud\n", + " CHR\n", + " Charmoise\n", " \n", " \n", " 24\n", - " VEN\n", - " Mouton Vendéen\n", + " BMC\n", + " Blanc du Massif Central\n", " \n", " \n", " 25\n", - " RMN\n", - " Romane\n", + " BER\n", + " Berrichon du Cher\n", " \n", " \n", " 26\n", - " MTR\n", - " Manech Tête Rouge\n", + " VEN\n", + " Mouton Vendéen\n", " \n", " \n", "\n", @@ -508,33 +507,33 @@ ], "text/plain": [ " Code Population Name\n", - "0 CDL Causses du Lot\n", - "1 TEX Texel\n", - "2 SUF Suffolk\n", - "3 CHR Charmoise\n", - "4 ROM Romanov\n", - "5 COR Corse\n", - "6 RWE Rouge de l'Ouest\n", - "7 TAR Tarasconnaise\n", - "8 LAC Lacaune (milk)\n", - "9 RAV Rava\n", - "10 LAM Lacaune (meat)\n", - "11 RAM Mérinos de Rambouillet\n", - "12 BER Berrichon du Cher\n", - "13 OUE Ouessant\n", - "14 MOU Mourerous\n", - "15 BMC Blanc du Massif Central\n", - "16 LIM Limousine\n", - "17 EUR European Mouflon\n", - "18 NVE Noire du Velay\n", - "19 MER Mérinos d'Arles\n", - "20 CHA Mouton Charollais\n", - "21 ROU Roussin de la Hague\n", - "22 IDF Île de France\n", - "23 PAS Préalpes du Sud\n", - "24 VEN Mouton Vendéen\n", - "25 RMN Romane\n", - "26 MTR Manech Tête Rouge" + "0 IDF Île de France\n", + "1 ROM Romanov\n", + "2 CDL Causses du Lot\n", + "3 SUF Suffolk\n", + "4 LAC Lacaune (milk)\n", + "5 MER Mérinos d'Arles\n", + "6 PAS Préalpes du Sud\n", + "7 MTR Manech Tête Rouge\n", + "8 LAM Lacaune (meat)\n", + "9 OUE Ouessant\n", + "10 ROU Roussin de la Hague\n", + "11 LIM Limousine\n", + "12 CHA Mouton Charollais\n", + "13 RMN Romane\n", + "14 TAR Tarasconnaise\n", + "15 EUR European Mouflon\n", + "16 NVE Noire du Velay\n", + "17 MOU Mourerous\n", + "18 COR Corse\n", + "19 TEX Texel\n", + "20 RAM Mérinos de Rambouillet\n", + "21 RAV Rava\n", + "22 RWE Rouge de l'Ouest\n", + "23 CHR Charmoise\n", + "24 BMC Blanc du Massif Central\n", + "25 BER Berrichon du Cher\n", + "26 VEN Mouton Vendéen" ] }, "execution_count": 9, @@ -546,11 +545,103 @@ "merged = pd.merge(fids,infos, on=\"Code\")\n", "merged.loc[:, [\"Code\", \"Population Name\"]]" ] + }, + { + "cell_type": "markdown", + "id": "7220409b", + "metadata": {}, + "source": [ + "## Check dataset countries\n", + "Test for sample countries of origin" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2172d87a", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "516b2e83934c4e53ab5cf07b370a2684", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/27 [00:00