Skip to content

Commit

Permalink
- Update: medgen2obo.pl: (i) Abstracted adding of classes and their t…
Browse files Browse the repository at this point in the history
…riples as a function, (ii) updated namespacing of classes based on what type of MedGen/UMLS identifier they are.

- Update: Namespaces MedGen, MedGen_UID (removed), MedGenCUI
- Bugfix: SSSOM metadata yaml had a typo preventing conversion
- Bugfix: Makefile: (i) needed to rename a dependency, (ii) needed to run 'analyze' step after 'stage'
- Update: Makefile: Simplified some goals
  • Loading branch information
joeflack4 committed Aug 2, 2023
1 parent f461d52 commit a89f466
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 55 deletions.
4 changes: 2 additions & 2 deletions config/medgen.sssom-metadata.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ curie_map:
HP: http://purl.obolibrary.org/obo/HP_
MESH: http://identifiers.org/mesh/
MONDO: http://purl.obolibrary.org/obo/MONDO_
MedGen: http://purl.obolibrary.org/obo/Medgen_
MedGen_UID: http://purl.obolibrary.org/obo/Medgen_UID_
MedGen: http://purl.obolibrary.org/obo/MedGen_
MedGenCUI: http://purl.obolibrary.org/obo/MedGenCUI_
NCIT: http://purl.obolibrary.org/obo/NCIT_
OMIM: https://omim.org/entry/
Orphanet: http://www.orpha.net/ORDO/Orphanet_
Expand Down
12 changes: 6 additions & 6 deletions makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,17 @@
# Running `make all` will run the full pipeline. Note that if the FTP files have already been downloaded, it'll skip
# that part. In order to force re-download, run `make all -B`.
.DEFAULT_GOAL := all
.PHONY: all build stage stage-% release-artefacts analysis-artefacts clean deploy-release
.PHONY: all build stage stage-% analyze clean deploy-release

OBO=http://purl.obolibrary.org/obo
PRODUCTS=medgen-disease-extract.obo medgen-disease-extract.owl
TODAY ?=$(shell date +%Y-%m-%d)
VERSION=v$(TODAY)

all: build stage clean
release-artefacts: $(PRODUCTS) medgen.sssom.tsv
# analysis-artefacts runs more than just this file; that goal creates multiple files
analysis-artefacts: medgen_terms_mapping_status.tsv
build: release-artefacts analysis-artefacts
all: build stage clean analyze
# analyze: the rule behind this goal creates multiple files, not just the one listed here
analyze: output/medgen_terms_mapping_status.tsv
build: $(PRODUCTS) medgen.sssom.tsv
stage: $(patsubst %, stage-%, $(PRODUCTS))
mv medgen.obo output/release/
mv medgen.sssom.tsv output/release/
Expand Down Expand Up @@ -92,5 +91,6 @@ deploy-release: | output/release/
tmp/input/mondo.sssom.tsv: | tmp/input/
wget http://purl.obolibrary.org/obo/mondo/mappings/mondo.sssom.tsv -O $@

# This rule creates multiple files, not just a single target
output/medgen_terms_mapping_status.tsv output/obsoleted_medgen_terms_in_mondo.txt: | output/
python src/mondo_mapping_status.py
64 changes: 19 additions & 45 deletions src/medgen2obo.pl
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/perl
use strict;

# Vars
my %th = ();
my %rh = ();
my %dh = ();
Expand All @@ -10,6 +11,7 @@

our $PATH = "ftp.ncbi.nlm.nih.gov/pub/medgen";

# Execution
open(F,"gzip -dc $PATH/MGCONSO.RRF.gz|") || die;
while(<F>) {
next if m@^#@;
Expand Down Expand Up @@ -82,7 +84,7 @@
chomp;
my ($u,$c) = split(/\t/,$_);
$uh{$c} = $u;
$th{$c}->{xrefs}->{"MedGen_UID:$u"} = 1;
$th{$c}->{xrefs}->{"MedGen:$u"} = 1;
}
close(F);

Expand All @@ -101,52 +103,12 @@
}
print "\n";

my @ids = keys %th;
@ids = sort @ids;
foreach my $id (@ids) {
if ($id =~ /^C\d+/) {
# TODO: repurpose to func (this is instance 1/2)
my $h = $th{$id};
print "[Term]\n";
print "id: UMLS:$id\n";
print "name: $h->{name}\n";
foreach my $x (keys %{$h->{xrefs}}) {
$x =~ s@MSH:@MESH:@;
$x =~ s@NCI:@NCIT:@;
$x =~ s@SNOMEDCT_US:@SCTID:@;
print "xref: $x\n";
}
foreach (keys %{$ssh{$id} || {}}) {
my $ss = mk_subset($_);
print "subset: $ss\n";
}
foreach my $s (@{$h->{synonyms}}) {
my ($str, $x)= @$s;
$str = escq($str);
print "synonym: \"$str\" RELATED [$x]\n";
}
my $trelh = $rh{$id};
foreach my $rel (keys %{$trelh}) {
my $vh = $trelh->{$rel};
foreach my $v (keys %$vh) {
unless ($v eq $id) {
my $tag = "relationship: $rel";
if ($rel eq 'isa') {
$tag = 'is_a:';
}
if ($rel eq 'mapped_to') {
$tag = 'equivalent_to:';
}
print "$tag UMLS:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n";
}
}
}
print "\n";
}
# TODO: repurpose to func (this is instance 2/2)
sub add_triples {
my ($prefix, $id) = @_;

my $h = $th{$id};
print "[Term]\n";
print "id: MedGen:$id\n";
print "id: $prefix:$id\n";
print "name: $h->{name}\n";
foreach my $x (keys %{$h->{xrefs}}) {
$x =~ s@MSH:@MESH:@;
Expand Down Expand Up @@ -181,6 +143,18 @@
}
print "\n";
}
# Emit one or more [Term] stanzas per concept, visiting IDs in sorted order.
# The prefix passed to add_triples() depends on the shape of the identifier:
#   - IDs matching /^CN\d+/ are emitted once, under the MedGenCUI prefix.
#   - IDs matching /^C\d+/ are emitted under UMLS and again under MedGen.
#   - Every other ID is emitted under MedGen only.
my @ids = sort keys %th;
for my $id (@ids) {
    # "CN" identifiers get their own namespace and nothing else.
    if ($id =~ /^CN\d+/) {
        add_triples('MedGenCUI', $id);
        next;
    }

    # "C" + digits identifiers additionally get a UMLS-prefixed stanza.
    add_triples('UMLS', $id) if $id =~ /^C\d+/;
    add_triples('MedGen', $id);
}

exit 0;

Expand Down
12 changes: 10 additions & 2 deletions src/mondo_mapping_status.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,14 @@
MONDO_SSSOM_TSV = INPUT_DIR / 'mondo.sssom.tsv'
MEDGEN_SSSOM_TSV = RELEASE_OUTDIR / 'medgen.sssom.tsv'
# MEDGEN_PREFIXES: Some of these are old, some are new, some may not be used.
MEDGEN_PREFIXES = ['Medgen', 'MedGen', 'MEDGEN', 'Medgen_UID', 'MedGen_UID', 'UMLS', 'UMLS_CUI']
# todo: If I couldn't convert SSSOM properly with MedGen_CUI, shouldn't UMLS_CUI have a problem too? Though I think it may just be coming from previous work in Mondo; it's not being used in this ingest.
MEDGEN_PREFIXES = [
'Medgen', 'MedGen', 'MEDGEN', 'MedGenCUI', 'UMLS', 'UMLS_CUI',
# 'Medgen_UID', 'MedGen_UID', 'Medgen_CUI', 'MedGen_CUI', 'Medgen_CUI'
]
CURIE = str

# TODO: Mappings can be considered skos:exactMatch

def ids_prefixless(ids: Set[str]) -> Set[str]:
"""Remove prefix"""
Expand Down Expand Up @@ -92,16 +97,19 @@ def medgen_mondo_mapping_status(mondo_predicate_filter: List[str] = None):
file_suffix = '' if not mondo_predicate_filter \
else '-mondo-exacts-only' if mondo_predicate_filter == ['skos:exactMatch'] \
else '-custom'

# Read sources
medgen_all_ids, medgen_in_medgen, medgen_in_mondo = \
read_mapping_sources(mondo_predicate_filter=mondo_predicate_filter)

# Special operations
# - Inconsistent prefixes between what Mondo used before and what it will use going forward. In this case, stripping prefixes
# should be OK, at least for now.
medgen_all_ids = ids_prefixless(medgen_all_ids)
medgen_in_medgen = ids_prefixless(medgen_in_medgen)
medgen_in_mondo = ids_prefixless(medgen_in_mondo)
# Report

# Generate reports
report_obs_medgen_in_mondo(medgen_in_mondo, medgen_in_medgen)
report_existing_overlap(medgen_all_ids, medgen_in_medgen, medgen_in_mondo, file_suffix)

Expand Down

0 comments on commit a89f466

Please sign in to comment.