Skip to content

Commit

Permalink
🧐 add additional statistics to smarter summary
Browse files Browse the repository at this point in the history
  • Loading branch information
bunop committed Sep 10, 2024
1 parent 0884c0c commit b855cda
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 18 deletions.
3 changes: 2 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,6 @@
"tabix",
"tqdm",
"vcftools"
]
],
"makefile.configureOnOpen": false
}
119 changes: 103 additions & 16 deletions notebooks/results/0.2.0-bunop-smarter_summary.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"id": "3cc84f47-db82-43eb-99fd-78768ecd2a99",
"metadata": {},
"source": [
"# SMARTER SUMMARY (2024/08/31)\n",
"# SMARTER SUMMARY (2024/09/10)\n",
"* [Dataset composition](#datasets-composition)\n",
" * [Foreground / background datasets](#foreground-vs-background-datasets)\n",
" * [Datasets by chip type](#datasets-by-chip-type)\n",
Expand All @@ -27,7 +27,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "c19ca423-c3d1-40e5-9e76-3b9779b322a5",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -187,7 +187,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"id": "74aab105-c016-4a98-9d7d-c30fe7d90830",
"metadata": {},
"outputs": [],
Expand All @@ -196,7 +196,8 @@
"for chip_name in SampleSheep.objects.distinct(\"chip_name\"):\n",
" sheep_by_chip['chip_name'].append(chip_name)\n",
" sheep_by_chip['count'].append(SampleSheep.objects.filter(chip_name=chip_name).count())\n",
"sheep_by_chip = pd.DataFrame.from_dict(sheep_by_chip).set_index(\"chip_name\")"
"sheep_by_chip = pd.DataFrame.from_dict(sheep_by_chip).set_index(\"chip_name\")\n",
"sheep_by_chip.to_excel(\"sheep-samples-by-chip-type.xlsx\", index=True)"
]
},
{
Expand All @@ -213,6 +214,92 @@
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "a8c0be02",
"metadata": {},
"source": [
"Collect breed by chip type:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "43f6e18b",
"metadata": {},
"outputs": [],
"source": [
"tmp = pd.DataFrame(data=json.loads(SampleSheep.objects.to_json()))\n",
"\n",
"# group by breed and chip_name and count the number of occurrences\n",
"grouped_counts = tmp.groupby(['breed', 'chip_name']).size().reset_index(name='count')\n",
"grouped_counts.to_excel(\"sheep-samples-by-breed-and-chip-type.xlsx\", index=False)\n",
"grouped_counts.head()"
]
},
{
"cell_type": "markdown",
"id": "534f2835",
"metadata": {},
"source": [
"Count how many breeds I have for chip type:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e1cfedf4",
"metadata": {},
"outputs": [],
"source": [
"breed_counts_by_chip = grouped_counts['chip_name'].value_counts().reset_index()\n",
"breed_counts_by_chip.columns = ['chip_name', 'number_of_breeds']\n",
"breed_counts_by_chip.to_excel('breed_counts_by_chip.xlsx', index=False)\n",
"breed_counts_by_chip"
]
},
{
"cell_type": "markdown",
"id": "33a77374",
"metadata": {},
"source": [
"Count how many breeds have geographical coordinates:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1423f569",
"metadata": {},
"outputs": [],
"source": [
"tmp = pd.DataFrame(data=json.loads(SampleSheep.objects.filter(locations__exists=True).to_json()))\n",
"\n",
"# group by breed and chip_name and count the number of occurrences\n",
"grouped_counts_gps = tmp.groupby(['breed', 'chip_name']).size().reset_index(name='count')\n",
"grouped_counts_gps.to_excel(\"sheep-samples-by-breed-and-chip-type-with-gps.xlsx\", index=False)\n",
"grouped_counts_gps.head()"
]
},
{
"cell_type": "markdown",
"id": "55c49479",
"metadata": {},
"source": [
"Count how many samples have geographical coordinates by chip type:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dc811530",
"metadata": {},
"outputs": [],
"source": [
"gps_sample_counts_by_chip = grouped_counts_gps.groupby('chip_name')['count'].sum().reset_index()\n",
"gps_sample_counts_by_chip"
]
},
{
"cell_type": "markdown",
"id": "0523c150-7b6a-4e1d-b3d3-a46d52b786c3",
Expand All @@ -225,7 +312,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"id": "58353886-f4d7-4a7e-973d-45c29aef015a",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -270,7 +357,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 14,
"id": "324b5381-5ff1-4b4c-ba23-569d77b11300",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -355,7 +442,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 18,
"id": "cf065c7b-797d-495b-a69f-9638e92a0454",
"metadata": {},
"outputs": [],
Expand All @@ -374,7 +461,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 19,
"id": "0eabfc20-47bc-4de4-89a7-aebc569c70f1",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -462,7 +549,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 23,
"id": "ac4bff84-34c1-4526-898e-b6dfe93b1148",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -492,7 +579,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 25,
"id": "33dd743b",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -539,7 +626,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 27,
"id": "bfef48b4-efc2-4fee-b272-1d4cb6b486c1",
"metadata": {},
"outputs": [],
Expand All @@ -560,7 +647,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 28,
"id": "d2a1c513-e5c0-4398-850f-80ab0f5b0087",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -619,7 +706,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 30,
"id": "2102c053-8030-4878-8288-a0611c859e8a",
"metadata": {},
"outputs": [],
Expand All @@ -632,7 +719,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 31,
"id": "30b33efc-b611-45c0-96a4-edaca2072ef1",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -676,7 +763,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 33,
"id": "0d660d28",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -721,7 +808,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 35,
"id": "4a11f36e",
"metadata": {},
"outputs": [],
Expand Down
2 changes: 1 addition & 1 deletion src/features/plinkio.py
Original file line number Diff line number Diff line change
Expand Up @@ -1265,7 +1265,7 @@ def prefix(self, prefix: str):
self.plink_file = plinkfile.open(self._prefix)

def read_mapfile(self):
"""Read map data and track informations in memory. Useful to process
"""Read map data and track information in memory. Useful to process
data files"""

self.mapdata = list()
Expand Down

0 comments on commit b855cda

Please sign in to comment.