diff --git a/Makefile b/Makefile index 9507c28..0598120 100644 --- a/Makefile +++ b/Makefile @@ -92,6 +92,7 @@ data: requirements $(PYTHON_INTERPRETER) src/data/add_breed.py --species_class sheep --name Texel --code TEX --alias TEXEL_UY --dataset TEXEL_INIA_UY.zip $(PYTHON_INTERPRETER) src/data/add_breed.py --species_class sheep --name Frizarta --code FRZ --alias 0 --dataset Frizarta54samples_ped_map_files.zip $(PYTHON_INTERPRETER) src/data/add_breed.py --species_class sheep --name Merino --code MER --alias MERINO_UY --dataset MERINO_INIA_UY.zip + $(PYTHON_INTERPRETER) src/data/add_breed.py --species_class sheep --name Merino --code MER --alias MER --dataset MERINO_INIA_UY.zip $(PYTHON_INTERPRETER) src/data/add_breed.py --species_class sheep --name Corriedale --code CRR --alias CORRIEDALE_UY --dataset CORRIEDALE_INIA_UY.zip $(PYTHON_INTERPRETER) src/data/add_breed.py --species_class sheep --name Creole --code CRL --alias CRL --dataset CREOLE_INIA_UY.zip $(PYTHON_INTERPRETER) src/data/add_breed.py --species_class sheep --name "Mérinos d'Arles" --code ARL --alias MER --dataset="High density genotypes of French Sheep populations.zip" @@ -196,9 +197,9 @@ data: requirements $(foreach ASSEMBLY, $(SHEEP_ASSEMBLIES), $(PYTHON_INTERPRETER) src/data/import_from_plink.py --file Frizarta54samples_ped_map_files/Frizarta54samples \ --dataset Frizarta54samples_ped_map_files.zip --src_coding forward --chip_name IlluminaOvineSNP50 \ --assembly $(ASSEMBLY) --create_samples;) - $(foreach ASSEMBLY, $(SHEEP_ASSEMBLIES), $(PYTHON_INTERPRETER) src/data/import_from_plink.py --file MERINO_UY_96_21_12_17_OV54k \ - --dataset MERINO_INIA_UY.zip --chip_name IlluminaOvineSNP50 \ - --assembly $(ASSEMBLY) --create_samples;) + $(PYTHON_INTERPRETER) src/data/import_samples.py --src_dataset MERINO_INIA_UY.zip \ + --datafile MERINO_UY_96_21_12_17_OV54k_samples.xlsx --code_column code --id_column iid \ + --chip_name IlluminaOvineSNP50 --country_column country $(foreach ASSEMBLY, $(SHEEP_ASSEMBLIES), $(PYTHON_INTERPRETER) src/data/import_from_plink.py --file CORRIEDALE_UY_60_INIA_Ovine_14sep2010 \ --dataset CORRIEDALE_INIA_UY.zip --chip_name IlluminaOvineSNP50 \ --assembly $(ASSEMBLY) --create_samples;) @@ -336,6 +337,8 @@ data: requirements --chip_name IlluminaOvineSNP50 --country_column Country ## convert genotypes without creating samples in database (SHEEP) + $(foreach ASSEMBLY, $(SHEEP_ASSEMBLIES), $(PYTHON_INTERPRETER) src/data/import_from_plink.py --file MERINO_UY_96_21_12_17_OV54k \ + --dataset MERINO_INIA_UY.zip --chip_name IlluminaOvineSNP50 --assembly $(ASSEMBLY);) $(foreach ASSEMBLY, $(SHEEP_ASSEMBLIES), $(PYTHON_INTERPRETER) src/data/import_from_affymetrix.py --prefix Affymetrix_data_Plate_652_660/Affymetrix_data_Plate_652/Affymetrix_data_Plate_652 \ --dataset Affymetrix_data_Plate_652_660.zip --breed_code CRR --chip_name AffymetrixAxiomOviCan --assembly $(ASSEMBLY) --sample_field alias \ --src_version Oar_v4.0 --src_imported_from affymetrix;) diff --git a/notebooks/exploratory/0.9.0-bunop-describe-MERINO_INIA.ipynb b/notebooks/exploratory/0.9.0-bunop-describe-MERINO_INIA.ipynb index 2710a70..79eb57c 100644 --- a/notebooks/exploratory/0.9.0-bunop-describe-MERINO_INIA.ipynb +++ b/notebooks/exploratory/0.9.0-bunop-describe-MERINO_INIA.ipynb @@ -17,15 +17,15 @@ "outputs": [], "source": [ "import io\n", - "import csv\n", "import itertools\n", + "import pandas as pd\n", "\n", - "from collections import Counter\n", "from zipfile import ZipFile\n", - "from pathlib import Path\n", "\n", "import src.features.illumina\n", - "from src.features.smarterdb import VariantSheep, global_connection" + "from src.features.smarterdb import VariantSheep, global_connection, Dataset, Breed\n", + "from src.features.utils import get_project_dir\n", + "from src.features.plinkio import TextPlinkIO" ] }, { @@ -35,7 +35,7 @@ "metadata": {}, "outputs": [], "source": [ - "project_dir = Path.cwd().parents[1]\n", + "project_dir = get_project_dir()\n", "datafile = project_dir / \"data/raw/background/MERINO_INIA_UY.zip\"" ] }, @@ -128,12 +128,32 @@ "execution_count": 6, "id": "upper-judgment", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Skipping: Illumina, Inc.,,,,,,,,,,,,,,,,,,,\n", + "Skipping: [Heading],,,,,,,,,,,,,,,,,,,,\n", + "Skipping: Descriptor File Name,OvineSNP50_B.bpm,,,,,,,,,,,,,,,,,,,\n", + "Skipping: Assay Format,Infinium HD Ultra,,,,,,,,,,,,,,,,,,,\n", + "Skipping: Date Manufactured,1/7/2009,,,,,,,,,,,,,,,,,,,\n", + "Skipping: Loci Count ,54241,,,,,,,,,,,,,,,,,,,\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Skipping: [Assay],,,,,,,,,,,,,,,,,,,,\n" + ] + } + ], "source": [ "chip_dir = project_dir / \"data/external/SHE/ILLUMINA/\"\n", - "old_chip3_file = chip_dir / \"ovinesnp50_b.csv\"\n", + "old_chip3_file = chip_dir / \"ovinesnp50_b.csv.gz\"\n", "old_chip3 = dict()\n", - "for record in src.features.illumina.read_Manifest(old_chip3_file):\n", + "for record in src.features.illumina.read_Manifest(old_chip3_file, delimiter=\",\"):\n", " old_chip3[record.name] = (record.chr, record.mapinfo)" ] }, @@ -160,15 +180,15 @@ "missing = 0\n", "\n", "for key, value in old_chip3.items():\n", - " if not key in data_coordinates:\n", + " if key not in data_coordinates:\n", " missing += 1\n", " continue\n", - " \n", + "\n", " if value != data_coordinates[key]:\n", " count += 1\n", " if count <= 10:\n", " print(key, value, data_coordinates[key])\n", - " \n", + "\n", "print(f\"\\nN of SNPs in different positions in merino and old chip3: {count}\")\n", "print(f\"\\nN of SNPs in merino not in chip: {missing}\")" ] @@ -196,11 +216,11 @@ "snp 250506CS3900176800001_906.1 with different positions: 7:89002990 <> 7:81648528\n", "snp 250506CS3900211600001_1041.1 with different positions: 16:44955568 <> 16:41355381\n", "snp 250506CS3900218700001_1294.1 with different positions: 2:157820235 <> 2:148802744\n", - "snp 250506CS3900283200001_442.1 with different positions: 1:203289635 <> 99:0\n", + "snp 250506CS3900283200001_442.1 with different positions: 1:203289635 <> 0:0\n", "snp 250506CS3900371000001_1255.1 with different positions: 11:37632867 <> 11:35339123\n", "snp 250506CS3900386000001_696.1 with different positions: 16:68297712 <> 16:62646307\n", "snp 250506CS3900414400001_1178.1 with different positions: 1:111100644 <> 1:103396552\n", - "snp 250506CS3900435700001_1658.1 with different positions: 12:50140951 <> 99:0\n", + "snp 250506CS3900435700001_1658.1 with different positions: 12:50140951 <> 0:0\n", "snp 250506CS3900464100001_519.1 with different positions: 1:91075445 <> 1:85767398\n", "snp 250506CS3900487100001_1521.1 with different positions: 14:1552575 <> 14:1110363\n", "snp 250506CS3900539000001_471.1 with different positions: X:74622875 <> X:115765957\n", @@ -209,8 +229,7 @@ "snp CL635241_413.1 with different positions: 3:196207011 <> 3:182202867\n", "snp CL635750_128.1 with different positions: 3:242198228 <> 3:223741135\n", "snp CL635944_160.1 with different positions: 0:0 <> 6:114778683\n", - "snp Contig35697_5761.1 with different positions: 0:0 <> 6:18835475\n", - "snp CR_594.1 with different positions: 0:0 <> 99:0\n" + "snp Contig35697_5761.1 with different positions: 0:0 <> 6:18835475\n" ] } ], @@ -218,7 +237,7 @@ "global_connection()\n", "for key, value in itertools.islice(data_coordinates.items(), 20):\n", " qs = VariantSheep.objects(name=key)\n", - " \n", + "\n", " if qs.count() > 0:\n", " variant = qs.get()\n", " location = next(filter(lambda loc: loc.imported_from == \"SNPchiMp v.3\", variant.locations))\n", @@ -226,10 +245,643 @@ " print(f\"snp {key} with different positions: {value[0]}:{value[1]} <> {location.chrom}:{location.position}\")" ] }, + { + "cell_type": "markdown", + "id": "aware-logging", + "metadata": {}, + "source": [ + "## Fix metadata\n", + "\n", + "There are some animals which are imported (have GPS coordinates not in uruguay). So read from metadata and write a new metadata samples with the proper country. First, get samples and breed from ped file:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "94f16289", + "metadata": {}, + "outputs": [], + "source": [ + "dataset = Dataset.objects.get(file=\"MERINO_INIA_UY.zip\")\n", + "plinkio = TextPlinkIO(\n", + " prefix=str(dataset.working_dir / \"MERINO_UY_96_21_12_17_OV54k\"),\n", + " species=dataset.species,\n", + " chip_name=dataset.chip_name)" + ] + }, + { + "cell_type": "markdown", + "id": "89133e3c", + "metadata": {}, + "source": [ + "Now get `fid` and `iid`:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d6579319", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[['MERINO_UY', '201711200001'],\n", + " ['MERINO_UY', '201711200002'],\n", + " ['MERINO_UY', '201711200003'],\n", + " ['MERINO_UY', '201711200004'],\n", + " ['MERINO_UY', '201711200005'],\n", + " ['MERINO_UY', '201711200006'],\n", + " ['MERINO_UY', '201711200007'],\n", + " ['MERINO_UY', '201711200008'],\n", + " ['MERINO_UY', '201711200009'],\n", + " ['MERINO_UY', '201711200010']]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples = [[fid, iid] for fid, iid, *_ in plinkio.read_pedfile()]\n", + "samples[:10]" + ] + }, + { + "cell_type": "markdown", + "id": "76efc585", + "metadata": {}, + "source": [ + "Ok, now I need to open the proper metadata file, in order to select the animal I need:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "f33c25d1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDBreedStallGPS_CoordinatesGPS_2
01CorriedaleCIEDAG-33.86937579589417, -55.57265008365528https://www.google.com/maps/place/CIEDAG+-+Sec...
110CorriedaleCIEDAG-33.86937579589417, -55.57265008365528https://www.google.com/maps/place/CIEDAG+-+Sec...
211CorriedaleCIEDAG-33.86937579589417, -55.57265008365528https://www.google.com/maps/place/CIEDAG+-+Sec...
312CorriedaleCIEDAG-33.86937579589417, -55.57265008365528https://www.google.com/maps/place/CIEDAG+-+Sec...
413CorriedaleCIEDAG-33.86937579589417, -55.57265008365528https://www.google.com/maps/place/CIEDAG+-+Sec...
\n", + "
" + ], + "text/plain": [ + " ID Breed Stall GPS_Coordinates \\\n", + "0 1 Corriedale CIEDAG -33.86937579589417, -55.57265008365528 \n", + "1 10 Corriedale CIEDAG -33.86937579589417, -55.57265008365528 \n", + "2 11 Corriedale CIEDAG -33.86937579589417, -55.57265008365528 \n", + "3 12 Corriedale CIEDAG -33.86937579589417, -55.57265008365528 \n", + "4 13 Corriedale CIEDAG -33.86937579589417, -55.57265008365528 \n", + "\n", + " GPS_2 \n", + "0 https://www.google.com/maps/place/CIEDAG+-+Sec... \n", + "1 https://www.google.com/maps/place/CIEDAG+-+Sec... \n", + "2 https://www.google.com/maps/place/CIEDAG+-+Sec... \n", + "3 https://www.google.com/maps/place/CIEDAG+-+Sec... \n", + "4 https://www.google.com/maps/place/CIEDAG+-+Sec... " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "metadata = Dataset.objects.get(file=\"Smarter_Ids_Uploaded_with_GPSCordinates_FINAL.zip\")\n", + "df = pd.read_excel(metadata.working_dir / \"Smarter_Ids_Uploaded_with_GPSCordinates_FINAL.xlsx\")\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "f68fd0af", + "metadata": {}, + "source": [ + "Ok now I need to select from metadata the sample I have in this dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "3998bd90", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
fidiid
0MERINO_UY201711200001
1MERINO_UY201711200002
2MERINO_UY201711200003
3MERINO_UY201711200004
4MERINO_UY201711200005
\n", + "
" + ], + "text/plain": [ + " fid iid\n", + "0 MERINO_UY 201711200001\n", + "1 MERINO_UY 201711200002\n", + "2 MERINO_UY 201711200003\n", + "3 MERINO_UY 201711200004\n", + "4 MERINO_UY 201711200005" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples = pd.DataFrame(samples, columns=[\"fid\", \"iid\"])\n", + "samples.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "b651383f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of samples in the metadata: 96\n", + "Number of samples in the plink file: 96\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDBreedStallGPS_CoordinatesGPS_2
158201711200065MerinoEEFAS-31.38766026946753, -57.71601394419916https://earth.google.com/web/search/Estacion+E...
159201711200066MerinoEEFAS-31.38766026946753, -57.71601394419916https://earth.google.com/web/search/Estacion+E...
160201711200005MerinoIMPORTADO-30.945528469563605, 151.2477073694806https://www.google.com/maps/place/Nerstane+Mer...
161201711200012MerinoIMPORTADO-30.945528469563605, 151.2477073694806https://www.google.com/maps/place/Nerstane+Mer...
162201711200013MerinoIMPORTADO-30.945528469563605, 151.2477073694806https://www.google.com/maps/place/Nerstane+Mer...
\n", + "
" + ], + "text/plain": [ + " ID Breed Stall GPS_Coordinates \\\n", + "158 201711200065 Merino EEFAS -31.38766026946753, -57.71601394419916 \n", + "159 201711200066 Merino EEFAS -31.38766026946753, -57.71601394419916 \n", + "160 201711200005 Merino IMPORTADO -30.945528469563605, 151.2477073694806 \n", + "161 201711200012 Merino IMPORTADO -30.945528469563605, 151.2477073694806 \n", + "162 201711200013 Merino IMPORTADO -30.945528469563605, 151.2477073694806 \n", + "\n", + " GPS_2 \n", + "158 https://earth.google.com/web/search/Estacion+E... \n", + "159 https://earth.google.com/web/search/Estacion+E... \n", + "160 https://www.google.com/maps/place/Nerstane+Mer... \n", + "161 https://www.google.com/maps/place/Nerstane+Mer... \n", + "162 https://www.google.com/maps/place/Nerstane+Mer... " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['ID'] = df['ID'].astype(str)\n", + "df_selected = df[df['ID'].isin(samples['iid'])]\n", + "print(f\"Number of samples in the metadata: {len(df_selected)}\")\n", + "print(f\"Number of samples in the plink file: {len(samples)}\")\n", + "df_selected.head()" + ] + }, + { + "cell_type": "markdown", + "id": "52cd9927", + "metadata": {}, + "source": [ + "Select the proper code for this breed:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "e9b2a503", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'MER'" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "breed = Breed.objects.get(name=\"Merino\", species=\"Sheep\")\n", + "breed.code" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "d7279831", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
fidiidcodecountry
0MERINO_UY201711200001MERUruguay
1MERINO_UY201711200002MERUruguay
2MERINO_UY201711200003MERUruguay
3MERINO_UY201711200004MERUruguay
4MERINO_UY201711200005MERUruguay
\n", + "
" + ], + "text/plain": [ + " fid iid code country\n", + "0 MERINO_UY 201711200001 MER Uruguay\n", + "1 MERINO_UY 201711200002 MER Uruguay\n", + "2 MERINO_UY 201711200003 MER Uruguay\n", + "3 MERINO_UY 201711200004 MER Uruguay\n", + "4 MERINO_UY 201711200005 MER Uruguay" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples[\"code\"] = breed.code\n", + "samples[\"country\"] = \"Uruguay\"\n", + "samples.head()" + ] + }, + { + "cell_type": "markdown", + "id": "18c2f44f", + "metadata": {}, + "source": [ + "Now select all the animals with `IMPORTADO` `Stall` and change country to `Australia`:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "5891432c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "country\n", + "Uruguay 84\n", + "Australia 12\n", + "Name: count, dtype: int64\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
fidiidcodecountry
0MERINO_UY201711200001MERUruguay
1MERINO_UY201711200002MERUruguay
2MERINO_UY201711200003MERUruguay
3MERINO_UY201711200004MERUruguay
4MERINO_UY201711200005MERAustralia
\n", + "
" + ], + "text/plain": [ + " fid iid code country\n", + "0 MERINO_UY 201711200001 MER Uruguay\n", + "1 MERINO_UY 201711200002 MER Uruguay\n", + "2 MERINO_UY 201711200003 MER Uruguay\n", + "3 MERINO_UY 201711200004 MER Uruguay\n", + "4 MERINO_UY 201711200005 MER Australia" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples.loc[samples['iid'].isin(df_selected[df_selected[\"Stall\"] == \"IMPORTADO\"]['ID']), 'country'] = 'Australia'\n", + "print(samples[\"country\"].value_counts())\n", + "samples.head()" + ] + }, + { + "cell_type": "markdown", + "id": "2e76f209", + "metadata": {}, + "source": [ + "Write this metadata to a new file:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "e0046755", + "metadata": {}, + "outputs": [], + "source": [ + "samples.to_excel(metadata.working_dir / \"MERINO_UY_96_21_12_17_OV54k_samples.xlsx\", index=False)" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "aware-logging", + "id": "b9a1de9d", "metadata": {}, "outputs": [], "source": [] @@ -237,7 +889,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -251,7 +903,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1" + "version": "3.12.3" } }, "nbformat": 4,