chembl · louwenjjr · Jul 18, 2024
diff --git a/README.md b/README.md
@@ -129,7 +129,7 @@ Bucket 3 (-) Targets with PROTAC in clinical phase 1
 Bucket 4 (Europe PMC) Targets mentioned in PROTAC literature (manually curated) + additional information via automated assessments
 Bucket 5 (UniProt) Targets with UniProt keyword: "Ubl conjugation [KW-0832]"  
 Bucket 6 (PhosphoSitePlus, mUbiSiDa, Kim et al. 2011) Targets with reported ubiquitination sites in PhosphoSitePlus, mUbiSiDa (2013), or [Kim et al. 2011](https://www.sciencedirect.com/science/article/pii/S1097276511006757)  
-Bucket 7 ([Mathieson et al. 2018](https://www.nature.com/articles/s41467-018-03106-1)) Targets with available half-life data  
+Bucket 7 ([Mathieson et al. 2018](https://www.nature.com/articles/s41467-018-03106-1) and [Rolfs et al. 2021](https://www.nature.com/articles/s41467-021-26842-3)) Targets with available half-life data  
 Bucket 8 (ChEMBL) Targets with SM in ChEMBL with activity on target-based assay (pChEMBL ≥5) 
 
 PROTAC_location_Bucket  
@@ -234,7 +234,7 @@ Column 'Uniprot_PTM' contains additional information on posttranslational modifi
 Column 'Uniprot_CrossLink' contains additional information on crosslinks from UniProt and is not associated with a bucket.  
 Columns 'Ub_PhosphoSitePlus', 'Ub_mUbiSiDa_2013' and 'number_of_ubiquitination_sites' relate to bucket 6 and represent information available 
 from the two databases PhosphoSitePlus and mUbiSiDa and the dataset from Kim et al., respectively.  
-Columns 'Max_halflife' and 'Min_halflife' contain the max/min values of half-life data means measured in different cell types from Mathieson et al. and provides more detail to bucket 7 assessment. 
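+Columns 'Max_halflife' and 'Min_halflife' contain the max/min half-life values across both sources: the half-life data means measured in different cell types from Mathieson et al., and the upper/lower confidence-interval bounds measured in different tissues from Rolfs et al. (converted from days to hours). They provide more detail to the bucket 7 assessment. 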
 The available data is provided in columns 'Bcell_mean', 'NKcell_mean', 'Hepatocytes_mean', 'MouseNeuorons_mean'.  
 Column 'count_compound_chembl_ids_PROTAC' contains the number of compounds found in ChEMBL related to bucket 8.
 

diff --git a/ot_tractability_pipeline_v2/buckets_protac.py b/ot_tractability_pipeline_v2/buckets_protac.py
@@ -956,9 +956,60 @@ def _assign_bucket_7(self):
         self.out_df['Bucket_7_PROTAC'] = 0
         # self.out_df['Bucket_x_PROTAC'] = 0
 
+        # read halflife data from mathieson
         df = pd.read_csv(os.path.join(DATA_PATH, 'protein_half_life_hq.csv'))
 
         df = df.merge(self.out_df, right_on='symbol', left_on='gene_name', how='right')
+
+        # read halflife data from rolfs
+        if os.path.isfile(os.path.join(DATA_PATH, 'protein_half_life_rolfs2021_processed.csv')):
+            df_rolfs = pd.read_csv(os.path.join(DATA_PATH, 'protein_half_life_rolfs2021_processed.csv'))
+        else:
+            dfs_rolfs = []
+            tissue_dict = {
+                'AC': 'Arytenoid Cartilage', 'Blood': 'Blood', 'CC': 'Cricoid Cartilage', 'Liver': 'Liver',
+                'LM': 'Laryngeal Muscle', 'SM': 'Sternocleidomastoid Muscle', 'TC': 'Thyroid Cartilage',
+                'VF': 'Vocal Fold Mucosa'
+            }
+            rolfs_data = os.path.join(DATA_PATH, 'protein_half_life_rolfs2021')
+            for file in os.listdir(rolfs_data):
+                temp = pd.read_csv(os.path.join(rolfs_data, file), sep='\t')
+                temp["Tissue"] = tissue_dict.get(file.partition("_")[0])
+                dfs_rolfs.append(temp)
+            df_rolfs = pd.concat(dfs_rolfs, ignore_index=True)
+            # split Protein (complexes) on ; and duplicate rows
+            df_rolfs = df_rolfs.assign(Protein=df_rolfs['Protein'].str.split(';')).explode('Protein')
+
+            # get mouse mapping file from MGI
+            mouse_map = urllib2.urlopen("https://www.informatics.jax.org/downloads/reports/HOM_MouseHumanSequence.rpt").readlines()
+            mouse_map = [x.decode('utf-8').strip('\n').split('\t') for x in mouse_map]
+            mouse_map = pd.DataFrame(mouse_map[1:], columns=mouse_map[0])
+            # get Symbol from mouse mapping
+            df_rolfs = df_rolfs.merge(mouse_map, left_on='Protein', right_on='SWISS_PROT IDs', how='left')
+            df_rolfs['Symbol'] = df_rolfs['Symbol'].str.upper()
+            # aggregate tissues and take min and max respectively
+            df_rolfs = df_rolfs.groupby('Symbol').agg(
+                {'LowerConfidenceInterval': 'min', 'UpperConfidenceInterval': 'max',
+                 'Tissue': lambda x: '|'.join(sorted((set(x))))}
+            )[['LowerConfidenceInterval', 'UpperConfidenceInterval', 'Tissue']].reset_index()
+            # rename columns
+            df_rolfs.rename(
+                columns={'LowerConfidenceInterval': 'Min_halflife', 'UpperConfidenceInterval': 'Max_halflife',
+                         'Tissue': 'Tissue_rolfs'},
+                inplace=True
+            )
+            # convert from days to hours
+            df_rolfs['Min_halflife'] = df_rolfs['Min_halflife'] * 24
+            df_rolfs['Max_halflife'] = df_rolfs['Max_halflife'] * 24
+            df_rolfs.to_csv(os.path.join(DATA_PATH, 'protein_half_life_rolfs2021_processed.csv'), index=False)
+
+        df = df.merge(df_rolfs, left_on='symbol', right_on='Symbol', how='left')
+        # take min and max from both sources and avoid SettingWithCopyWarning
+        df = df.assign(
+            Min_halflife=df[['Min_halflife_x', 'Min_halflife_y']].min(axis=1),
+            Max_halflife=df[['Max_halflife_x', 'Max_halflife_y']].max(axis=1))
+        df.drop(columns=['Min_halflife_x', 'Min_halflife_y', 'Max_halflife_x', 'Max_halflife_y'], inplace=True)
+
         df = df.groupby('ensembl_gene_id', as_index=False).max(numeric_only=True)
         df['Max_halflife'].fillna(-1, inplace=True)
         df['Min_halflife'].fillna(-1, inplace=True)