Dihedral Plots: RDKit Mol Object (#243)

* add RDKit Mol object to dihedral analysis plots * add tests, and close #238 * add svgutils and cairosvg methods to plot svg mol object * reimplement DF input option and fix most tests to reflect name changes and altered function definitions * add svgutils and cairosvg to dependencies, install, requirements lists, remove broken test, add reminder to update func list in docs * split plot_violins into new build_svg function * change, better function names for dihedrals workflow module * docs and cleanup, plot width docs, dict comprehension for ab_pairs * intersphinx mapping * tests: new fixtures and tests for bond_indices and ab_pairs * tests: new fixtures and tests for bond_indices and ab_pairs, skip 3.7 * test_build_universe method * confirm build universe test * rewrite docs to cover new functions and kwarg changes * fix tests to accommodate kwarg updates in dihedrals module * explanation of why figdir is a kwarg at top level of dihedrals module but a positional argument elsewhere - workflows base **kwargs, issue #244, see in-line comment in dihedrals.py * temporary fix for figdir issue which should currently be a positional argument, but would require redundant rewrite of workflows base module, pending issue #244 * upcoming CHANGES * remove dafault scope specification for defined functions * reimplement try/except method for rdkit conversion topology element guessing * generate combined plots pdf for automated dihedral analysis * updates for implementation of pypdf in workflows dihedrals module: CHANGES, testing environment, requirements, sphinx source configuration * documentation for dihedral_violins function in workflows dihedrals module * documentation for get_paired_indices function in workflows dihedrals module * documentation and kwarg definition for get_paired_indices function and ab_pairs dictionary object in workflows dihedrals module * kwarg definition for plot_title for dihedral_violins function in workflows dihedrals module * move in-line comments explaining figdir kward for workflows dihedrals module * reorganize kwargs for plot_dihedral_violins in top-level automated_dihedral_analysis function call in workflows dihedrals module * add assert method to make figdir kwarg required in workflows dihedrals module * change MDA guess_atom_element to MDA guess_types for RDKit conversion in workflows dihedrals module * fix registry import error for workflows base, close #245 * remove guess_atom_element import * reimplement assert figdir reuired for workflows dihedrals module * add pypdf to setup.py install_requires for dihedrals workflow * change imports to follow PEP 8 * modify dihedrals workflow docs to explain figdir kwarg requirement * use first solvent specified to build MDAnalysis Universe * modify single solvent plotting method, add solvent count assertion * comment expected fixture scope changes, reference issue #235 * remove solute.unwrap, not needed * reference issue #260 to fix jupyter notebook figure output * finalize single solvent figure modifications and add test --------- Co-authored-by: Oliver Beckstein <[email protected]>
Becksteinlab · Jul 4, 2023 · 76b96d4 · 76b96d4
1 parent 3b93aad
commit 76b96d4
Show file tree

Hide file tree

Showing 9 changed files with 528 additions and 248 deletions.
diff --git a/CHANGES b/CHANGES
@@ -27,6 +27,12 @@ Changes
 
 Enhancements
 
+* convert figure components to SVG, save as individual PDFs,
+  and generate PDF of all figures combined in one file,
+  for workflows dihedrals module (#243)
+* add RDKit mol object to dihedrals plot with dihedral atom
+  indices labeled and dihedral atom group bonds highlighted
+  for workflows dihedrals module (#243)
 * new workflows registry that contains each EnsembleAnalysis for which
   a workflows module exists, for use with workflows base module (#229)
 * new workflows base module that provides iterative workflow use for

diff --git a/devtools/conda-envs/test_env.yaml b/devtools/conda-envs/test_env.yaml
@@ -18,6 +18,9 @@ dependencies:
 - pymbar >=4
 - rdkit
 - seaborn
+- svgutils
+- cairosvg
+- pypdf
 
   # Testing
 - pytest

diff --git a/doc/requirements.txt b/doc/requirements.txt
@@ -10,3 +10,6 @@ mdanalysis
 rdkit
 seaborn
 matplotlib
+svgutils
+cairosvg
+pypdf
diff --git a/doc/sphinx/source/conf.py b/doc/sphinx/source/conf.py
@@ -248,6 +248,9 @@
                        'https://www.rdkit.org/docs/': None,
                        'https://pandas.pydata.org/docs/': None,
                        'https://seaborn.pydata.org': None,
+                       'https://cairosvg.org/documentation/': None,
+                       'https://svgutils.readthedocs.io/en/latest/': None,
+                       'https://pypdf.readthedocs.io/en/stable/': None,
                    }
 
 

diff --git a/mdpow/tests/test_automated_dihedral_analysis.py b/mdpow/tests/test_automated_dihedral_analysis.py
@@ -1,72 +1,87 @@
 import os
 import sys
 import yaml
-import pybol
-import pytest
+import py.path
 import pathlib
 import logging
 
 import scipy
-import numpy as np
-import pandas as pd
-
+from scipy.stats import circvar, circmean
 import seaborn
-
+import numpy as np
 from numpy.testing import assert_almost_equal
-from scipy.stats import circvar, circmean
+import pandas as pd
+import pybol
+import pytest
 
 from . import RESOURCES
-
-import py.path
-
-from mdpow.workflows import dihedrals
-
 from pkg_resources import resource_filename
+from mdpow.workflows import dihedrals
 
 RESOURCES = pathlib.PurePath(resource_filename(__name__, 'testing_resources'))
 MANIFEST = RESOURCES / "manifest.yml"
 
-@pytest.fixture(scope="function")
+resname = "UNK"
+molname = "SM25"
+
+@pytest.fixture
 def molname_workflows_directory(tmp_path, molname='SM25'):
     m = pybol.Manifest(str(MANIFEST))
     m.assemble('workflows', tmp_path)
     return tmp_path / molname
 
 class TestAutomatedDihedralAnalysis(object):
 
-    @pytest.fixture(scope="function")
+    @pytest.fixture
     def SM25_tmp_dir(self, molname_workflows_directory):
         dirname = molname_workflows_directory
         return dirname
 
-    @pytest.fixture(scope="function")
-    def atom_indices(self, SM25_tmp_dir):
-        atom_group_indices = dihedrals.dihedral_indices(dirname=SM25_tmp_dir, resname=self.resname)
+    @pytest.fixture
+    def mol_sol_data(self, SM25_tmp_dir):
+        u = dihedrals.build_universe(dirname=SM25_tmp_dir)
+        mol, solute = dihedrals.rdkit_conversion(u=u, resname=resname)
+        return mol, solute
+
+    @pytest.fixture
+    def atom_indices(self, mol_sol_data):
+        mol, _ = mol_sol_data
+        atom_group_indices = dihedrals.get_atom_indices(mol=mol)
 
         # testing optional user input of alternate SMARTS string
         # for automated dihedral atom group selection
-        atom_group_indices_alt = dihedrals.dihedral_indices(dirname=SM25_tmp_dir,
-                                                            resname=self.resname,
-                                                            SMARTS='[!$(*#*)&!D1]-!@[!$(*#*)&!D1]')
+        atom_group_indices_alt = dihedrals.get_atom_indices(mol=mol, SMARTS='[!$(*#*)&!D1]-!@[!$(*#*)&!D1]')
         return atom_group_indices, atom_group_indices_alt
         # fixture output, tuple:
         # atom_indices[0]=atom_group_indices
         # atom_indices[1]=atom_group_indices_alt
 
-    @pytest.fixture(scope="function")
+    @pytest.fixture
+    def bond_indices(self, mol_sol_data, atom_indices):
+        mol, _ = mol_sol_data
+        atom_index, _ = atom_indices
+        bond_indices = dihedrals.get_bond_indices(mol=mol, atom_indices=atom_index)
+        return bond_indices
+
+    @pytest.fixture
+    def dihedral_groups(self, mol_sol_data, atom_indices):
+        _, solute = mol_sol_data
+        atom_index, _ = atom_indices
+        dihedral_groups = dihedrals.get_dihedral_groups(solute=solute, atom_indices=atom_index)
+        return dihedral_groups
+
+    @pytest.fixture
     def dihedral_data(self, SM25_tmp_dir, atom_indices):
         atom_group_indices, _ = atom_indices
-        df = dihedrals.dihedral_groups_ensemble(atom_group_indices=atom_group_indices,
+        df = dihedrals.dihedral_groups_ensemble(atom_indices=atom_group_indices,
                                                 dirname=SM25_tmp_dir,
                                                 solvents=('water',))
-        df_aug = dihedrals.periodic_angle(df)
+        df_aug = dihedrals.periodic_angle_padding(df)
         return df, df_aug
         # fixture output, tuple:
         # dihedral_data[0]=df
         # dihedral_data[1]=df_aug
 
-    resname = 'UNK'
-
     # tuple-tuples of dihedral atom group indices
     # collected using mdpow.workflows.dihedrals.SMARTS_DEFAULT
     check_atom_group_indices = ((0, 1, 2, 3),(0, 1, 12, 13),(1, 2, 3, 11),(1, 2, 3, 10),
@@ -79,6 +94,23 @@ def dihedral_data(self, SM25_tmp_dir, atom_indices):
     # see: fixture - atom_indices().atom_group_indices_alt
     check_atom_group_indices_alt = ((1, 2), (1, 12), (2, 3), (3, 4), (12, 13), (13, 14))
 
+    check_atom_name_index_pairs = {'O1-C2-N3-S4': (0, 1, 2, 3),
+                                   'O1-C2-C13-C14': (0, 1, 12, 13),
+                                   'C2-N3-S4-O12': (1, 2, 3, 11),
+                                   'C2-N3-S4-O11': (1, 2, 3, 10),
+                                   'C2-N3-S4-C5': (1, 2, 3, 4),
+                                   'C2-C13-C14-C15': (1, 12, 13, 14),
+                                   'N3-S4-C5-C6': (2, 3, 4, 5),
+                                   'N3-S4-C5-C10': (2, 3, 4, 9),
+                                   'N3-C2-C13-C14': (2, 1, 12, 13),
+                                   'S4-N3-C2-C13': (3, 2, 1, 12),
+                                   'C6-C5-S4-O12': (5, 4, 3, 11),
+                                   'C6-C5-S4-O11': (5, 4, 3, 10),
+                                   'C10-C5-S4-O12': (9, 4, 3, 11),
+                                   'C10-C5-S4-O11': (9, 4, 3, 10),
+                                   'C13-C14-C15-C16': (12, 13, 14, 15),
+                                   'C13-C14-C15-C20': (12, 13, 14, 19)}
+
     check_groups = [np.array(['O1', 'C2', 'N3', 'S4'], dtype=object),
                     np.array(['O1', 'C2', 'C13', 'C14'], dtype=object),
                     np.array(['C2', 'N3', 'S4', 'O12'], dtype=object),
@@ -132,29 +164,49 @@ def test_build_universe(self, SM25_tmp_dir):
     # between RDKIT versions; issue raised (#239) to identify and
     # resolve exact package/version responsible
     def test_dihedral_indices(self, atom_indices):
+
         atom_group_indices = atom_indices[0]
         assert set(atom_group_indices) == set(self.check_atom_group_indices)
 
     # Possible ordering issue (#239)
     def test_SMARTS(self, atom_indices):
-        atom_group_indices_alt = atom_indices[1]
+        _, atom_group_indices_alt = atom_indices
         assert atom_group_indices_alt == self.check_atom_group_indices_alt
 
     # Use set comparison because ordering of indices appears to change
     # between RDKIT versions; issue raised (#239) to identify and
     # resolve exact package/version responsible
-    def test_dihedral_groups(self, SM25_tmp_dir):
-        groups = dihedrals.dihedral_groups(dirname=SM25_tmp_dir, resname=self.resname)
+    def test_dihedral_groups(self, dihedral_groups):
+        groups = dihedral_groups
 
         values = [g.all() for g in groups]
         reference = [g.all() for g in self.check_groups]
 
         assert set(values) == set(reference)
 
+    # atom indices are determined by RDKit Mol object
+    # bond indices are determined by atom indices and are subsequently self-consistent
+    # dihedral group names are determined by the MDAnalysis solute object from RDKit-derived atom indices
+    # this test checks if indexing schemes for RDKit and MDAnalysis are consistent
+    def test_RDKit_MDAnalysis_atom_index_consistency(self, atom_indices, bond_indices, dihedral_groups):
+        atom_index, _ = atom_indices
+        bond_index = bond_indices
+        groups = dihedral_groups
+
+        name_index_pairs = dihedrals.get_paired_indices(atom_indices=atom_index, bond_indices=bond_index,
+                                                dihedral_groups=groups)
+
+        atom_name_index_pairs = {}
+
+        for key in name_index_pairs.keys():
+            atom_name_index_pairs[key] = name_index_pairs[key][0]
+
+        assert atom_name_index_pairs == self.check_atom_name_index_pairs
+
     # Possible ordering issue (#239)
     def test_dihedral_groups_ensemble(self, dihedral_data):
 
-        df = dihedral_data[0]
+        df, _ = dihedral_data
 
         dh1_result = df.loc[df['selection'] == 'O1-C2-N3-S4']['dihedral']
         dh1_mean = circmean(dh1_result, high=180, low=-180)
@@ -172,19 +224,21 @@ def test_dihedral_groups_ensemble(self, dihedral_data):
         dh2_var == pytest.approx(self.DG_C13141520_var)
 
     def test_save_df(self, dihedral_data, SM25_tmp_dir):
-        dihedrals.save_df(df=dihedral_data[0], df_save_dir=SM25_tmp_dir, molname='SM25')
+        df, _ = dihedral_data
+        dihedrals.save_df(df=df, df_save_dir=SM25_tmp_dir, resname='UNK', molname='SM25')
         assert (SM25_tmp_dir / 'SM25' / 'SM25_full_df.csv.bz2').exists(), 'Compressed csv file not saved'
 
     def test_save_df_info(self, dihedral_data, SM25_tmp_dir, caplog):
+        df, _ = dihedral_data
         caplog.clear()
         caplog.set_level(logging.INFO, logger='mdpow.workflows.dihedrals')
-        dihedrals.save_df(df=dihedral_data[0], df_save_dir=SM25_tmp_dir, molname='SM25')
+        dihedrals.save_df(df=df, df_save_dir=SM25_tmp_dir, resname='UNK', molname='SM25')
         assert f'Results DataFrame saved as {SM25_tmp_dir}/SM25/SM25_full_df.csv.bz2' in caplog.text, 'Save location not logged or returned'
 
     # Possible ordering issue (#239)
     def test_periodic_angle(self, dihedral_data):
 
-        df_aug = dihedral_data[1]
+        _, df_aug = dihedral_data
 
         aug_dh2_result = df_aug.loc[df_aug['selection'] == 'C13-C14-C15-C20']['dihedral']
 
@@ -195,37 +249,50 @@ def test_periodic_angle(self, dihedral_data):
         aug_dh2_var == pytest.approx(self.ADG_C13141520_var)
 
     # Possible ordering issue (#239)
+    # Tests using similar instances of the automated analyses
+    # will use module or class-scoped fixtures, pending #235
     def test_save_fig(self, SM25_tmp_dir):
         dihedrals.automated_dihedral_analysis(dirname=SM25_tmp_dir, figdir=SM25_tmp_dir,
-                                              resname=self.resname, molname='SM25',
+                                              resname=resname, molname='SM25',
                                               solvents=('water',))
         assert (SM25_tmp_dir / 'SM25' / 'SM25_C10-C5-S4-O11_violins.pdf').exists(), 'PDF file not generated'
 
     # Possible ordering issue (#239)
+    # Tests using similar instances of the automated analyses
+    # will use module or class-scoped fixtures, pending #235
     def test_save_fig_info(self, SM25_tmp_dir, caplog):
         caplog.clear()
         caplog.set_level(logging.INFO, logger='mdpow.workflows.dihedrals')
         dihedrals.automated_dihedral_analysis(dirname=SM25_tmp_dir, figdir=SM25_tmp_dir,
-                                              resname=self.resname, molname='SM25',
+                                              resname=resname, molname='SM25',
                                               solvents=('water',))
         assert f'Figure saved as {SM25_tmp_dir}/SM25/SM25_C10-C5-S4-O11_violins.pdf' in caplog.text, 'PDF file not saved'
 
-    def test_DataFrame_input(self, SM25_tmp_dir):
-        test_df = pd.DataFrame([['C1-C2-C3-C4', 'water', 'Coulomb', 0, 0, 60.0],
-                                ['C1-C2-C3-C5', 'water', 'Coulomb', 0, 0, 60.0]],
-                                [1,2],['selection', 'solvent', 'interaction', 'lambda', 'time', 'dihedral'])
-        plot = dihedrals.automated_dihedral_analysis(dirname=SM25_tmp_dir, figdir=SM25_tmp_dir,
-                                                     resname=self.resname,
-                                                     solvents=('water',), dataframe=test_df)
-        assert isinstance(plot, seaborn.axisgrid.FacetGrid)
+    # Tests using similar instances of the automated analyses
+    # will use module or class-scoped fixtures, pending #235
+    def test_DataFrame_input(self, SM25_tmp_dir, dihedral_data):
+        df, _ = dihedral_data
+        dihedrals.automated_dihedral_analysis(dirname=SM25_tmp_dir, figdir=SM25_tmp_dir,
+                                              resname=resname, molname=molname,
+                                              solvents=('water',), dataframe=df)
+        assert (SM25_tmp_dir / 'SM25' / 'SM25_C10-C5-S4-O11_violins.pdf').exists(), 'PDF file not generated'
 
-    def test_DataFrame_input_info(self, SM25_tmp_dir, caplog):
+    # Tests using similar instances of the automated analyses
+    # will use module or class-scoped fixtures, pending #235
+    def test_DataFrame_input_info(self, SM25_tmp_dir, dihedral_data, caplog):
         caplog.clear()
         caplog.set_level(logging.INFO, logger='mdpow.workflows.dihedrals')
-        test_df = pd.DataFrame([['C1-C2-C3-C4', 'water', 'Coulomb', 0, 0, 60.0],
-                                ['C1-C2-C3-C5', 'water', 'Coulomb', 0, 0, 60.0]],
-                                [1,2],['selection', 'solvent', 'interaction', 'lambda', 'time', 'dihedral'])
+        df, _ = dihedral_data
         dihedrals.automated_dihedral_analysis(dirname=SM25_tmp_dir, figdir=SM25_tmp_dir,
-                                              resname=self.resname,
-                                              solvents=('water',), dataframe=test_df)
+                                              resname=resname, molname=molname,
+                                              solvents=('water',), dataframe=df)
         assert 'Proceeding with results DataFrame provided.' in caplog.text, 'No dataframe provided or dataframe not recognized'
+
+    # testing resources only contain analyses with single solvent input
+    def test_single_solvent(self, dihedral_data):
+        df, _ = dihedral_data
+        # all analysis data in one violin plot
+        g = dihedrals.dihedral_violins(df=df, width=0.9, solvents=('water',), plot_title='test')
+        # number of solvents in DataFrame used to generate plot
+        number_of_solvents = g.data['solvent'].nunique()
+        assert number_of_solvents == 1
diff --git a/mdpow/tests/test_workflows_base.py b/mdpow/tests/test_workflows_base.py
@@ -2,19 +2,16 @@
 import os
 import sys
 import yaml
-import pybol
-import pytest
 import pathlib
 import logging
 
+import pybol
+import pytest
 import pandas as pd
 
-from mdpow.workflows import base
-
-from pkg_resources import resource_filename
-
 from . import RESOURCES, MANIFEST, STATES
-
+from pkg_resources import resource_filename
+from mdpow.workflows import base
 
 @pytest.fixture(scope='function')
 def molname_workflows_directory(tmp_path):
@@ -62,17 +59,23 @@ def test_project_paths_csv_input(self, csv_input_data):
 
         pd.testing.assert_frame_equal(project_paths, csv_df)
 
-    def test_automated_project_analysis(self, project_paths_data, caplog):
+    def test_dihedral_analysis_figdir_requirement(self, project_paths_data, caplog):
+        caplog.clear()
+        caplog.set_level(logging.ERROR, logger='mdpow.workflows.base')
+
         project_paths = project_paths_data
         # change resname to match topology (every SAMPL7 resname is 'UNK')
         # only necessary for this dataset, not necessary for normal use
         project_paths['resname'] = 'UNK'
 
-        base.automated_project_analysis(project_paths, solvents=('water',),
-                                 ensemble_analysis='DihedralAnalysis')
+        with pytest.raises(AssertionError,
+                           match="figdir MUST be set, even though it is a kwarg. Will be changed with #244"):
+
+            base.automated_project_analysis(project_paths, solvents=('water',),
+                                            ensemble_analysis='DihedralAnalysis')
 
-        assert 'all analyses completed' in caplog.text, ('automated_dihedral_analysis '
-               'did not iteratively run to completion for the provided project')
+            assert 'all analyses completed' in caplog.text, ('automated_dihedral_analysis '
+                   'did not iteratively run to completion for the provided project')
 
     def test_automated_project_analysis_KeyError(self, project_paths_data, caplog):
         caplog.clear()

diff --git a/mdpow/workflows/base.py b/mdpow/workflows/base.py
@@ -22,10 +22,10 @@
 
 import os
 import re
-import pandas as pd
-
 import logging
 
+import pandas as pd
+
 logger = logging.getLogger('mdpow.workflows.base')
 
 def project_paths(parent_directory=None, csv=None, csv_save_dir=None):