Source code for mdtraj.geometry.contact

##############################################################################
# MDTraj: A Python Library for Loading, Saving, and Manipulating
#         Molecular Dynamics Trajectories.
# Copyright 2012-2013 Stanford University and the Authors
#
# Authors: Christian Schwantes
# Contributors: Robert McGibbon
#
# MDTraj is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 2.1
# of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with MDTraj. If not, see <http://www.gnu.org/licenses/>.
##############################################################################


import itertools

import numpy as np

import mdtraj as md
from mdtraj.core import element
from mdtraj.utils import ensure_type

__all__ = ["compute_contacts", "squareform"]


[docs] def compute_contacts( traj, contacts="all", scheme="closest-heavy", ignore_nonprotein=True, periodic=True, soft_min=False, soft_min_beta=20, ): """Compute the distance between pairs of residues in a trajectory. Parameters ---------- traj : md.Trajectory An mdtraj trajectory. It must contain topology information. contacts : array-like, ndim=2 or 'all' An array containing pairs of indices (0-indexed) of residues to compute the contacts between, or 'all'. The string 'all' will select all pairs of residues separated by two or more residues (i.e. the i to i+1 and i to i+2 pairs will be excluded). scheme : {'ca', 'closest', 'closest-heavy', 'sidechain', 'sidechain-heavy'} scheme to determine the distance between two residues: 'ca' : distance between two residues is given by the distance between their alpha carbons 'closest' : distance is the closest distance between any two atoms in the residues 'closest-heavy' : distance is the closest distance between any two non-hydrogen atoms in the residues 'sidechain' : distance is the closest distance between any two atoms in residue sidechains 'sidechain-heavy' : distance is the closest distance between any two non-hydrogen atoms in residue sidechains ignore_nonprotein : bool When using `contact==all`, don't compute contacts between "residues" which are not protein (i.e. do not contain an alpha carbon). periodic : bool, default=True If periodic is True and the trajectory contains unitcell information, we will compute distances under the minimum image convention. soft_min : bool, default=False If soft_min is true, we will use a diffrentiable version of the scheme. The exact expression used is d = \frac{\beta}{log\\sum_i{exp(\frac{\beta}{d_i}})} where beta is user parameter which defaults to 20nm. The expression we use is copied from the plumed mindist calculator. http://plumed.github.io/doc-v2.0/user-doc/html/mindist.html soft_min_beta : float, default=20nm The value of beta to use for the soft_min distance option. Very large values might cause small contact distances to go to 0. Returns ------- distances : np.ndarray, shape=(n_frames, n_pairs), dtype=np.float32 Distances for each residue-residue contact in each frame of the trajectory residue_pairs : np.ndarray, shape=(n_pairs, 2), dtype=int Each row of this return value gives the indices of the residues involved in the contact. This argument mirrors the `contacts` input parameter. When `all` is specified as input, this return value gives the actual residue pairs resolved from `all`. Furthermore, when scheme=='ca', any contact pair supplied as input corresponding to a residue without an alpha carbon (e.g. HOH) is ignored from the input contacts list, meanings that the indexing of the output `distances` may not match up with the indexing of the input `contacts`. But the indexing of `distances` *will* match up with the indexing of `residue_pairs` Examples -------- >>> # To compute the contact distance between residue 0 and 10 and >>> # residues 0 and 11 >>> md.compute_contacts(t, [[0, 10], [0, 11]]) >>> # the itertools library can be useful to generate the arrays of indices >>> group_1 = [0, 1, 2] >>> group_2 = [10, 11] >>> pairs = list(itertools.product(group_1, group_2)) >>> print(pairs) [(0, 10), (0, 11), (1, 10), (1, 11), (2, 10), (2, 11)] >>> md.compute_contacts(t, pairs) See Also -------- mdtraj.geometry.squareform : turn the result from this function into a square "contact map" Topology.residue : Get residues from the topology by index """ if traj.topology is None: raise ValueError("contact calculation requires a topology") if isinstance(contacts, str): if contacts.lower() != "all": raise ValueError( "(%s) is not a valid contacts specifier" % contacts.lower(), ) residue_pairs = [] for i in range(traj.n_residues): residue_i = traj.topology.residue(i) if ignore_nonprotein and not any(a for a in residue_i.atoms if a.name.lower() == "ca"): continue for j in range(i + 3, traj.n_residues): residue_j = traj.topology.residue(j) if ignore_nonprotein and not any(a for a in residue_j.atoms if a.name.lower() == "ca"): continue if residue_i.chain == residue_j.chain: residue_pairs.append((i, j)) residue_pairs = np.array(residue_pairs) if len(residue_pairs) == 0: raise ValueError("No acceptable residue pairs found") else: residue_pairs = ensure_type( np.asarray(contacts), dtype=int, ndim=2, name="contacts", shape=(None, 2), warn_on_cast=False, ) if not np.all((residue_pairs >= 0) * (residue_pairs < traj.n_residues)): raise ValueError( "contacts requests a residue that is not in the permitted range", ) # now the bulk of the function. This will calculate atom distances and then # re-work them in the required scheme to get residue distances scheme = scheme.lower() if scheme not in ["ca", "closest", "closest-heavy", "sidechain", "sidechain-heavy"]: raise ValueError( "scheme must be one of [ca, closest, closest-heavy, sidechain, sidechain-heavy]", ) if scheme == "ca": if soft_min: import warnings warnings.warn( "The soft_min=True option with scheme=ca gives" "the same results as soft_min=False", ) filtered_residue_pairs = [] atom_pairs = [] for r0, r1 in residue_pairs: ca_atoms_0 = [a.index for a in traj.top.residue(r0).atoms if a.name.lower() == "ca"] ca_atoms_1 = [a.index for a in traj.top.residue(r1).atoms if a.name.lower() == "ca"] if len(ca_atoms_0) == 1 and len(ca_atoms_1) == 1: atom_pairs.append((ca_atoms_0[0], ca_atoms_1[0])) filtered_residue_pairs.append((r0, r1)) elif len(ca_atoms_0) == 0 or len(ca_atoms_1) == 0: # residue does not contain a CA atom, skip it if contacts != "all": # if the user manually asked for this residue, and didn't use "all" import warnings warnings.warn( "Ignoring contacts pair %d-%d. No alpha carbon." % (r0, r1), ) else: raise ValueError( "More than 1 alpha carbon detected in residue %d or %d" % (r0, r1), ) residue_pairs = np.array(filtered_residue_pairs) distances = md.compute_distances(traj, atom_pairs, periodic=periodic) elif scheme in ["closest", "closest-heavy", "sidechain", "sidechain-heavy"]: if scheme == "closest": residue_membership = [[atom.index for atom in residue.atoms] for residue in traj.topology.residues] elif scheme == "closest-heavy": # then remove the hydrogens from the above list residue_membership = [ [atom.index for atom in residue.atoms if not (atom.element == element.hydrogen)] for residue in traj.topology.residues ] elif scheme == "sidechain": residue_membership = [ [atom.index for atom in residue.atoms if atom.is_sidechain] for residue in traj.topology.residues ] elif scheme == "sidechain-heavy": # then remove the hydrogens from the above list if "GLY" in [residue.name for residue in traj.topology.residues]: import warnings warnings.warn( "selected topology includes at least one glycine residue, which has no heavy " "atoms in its sidechain. The distances involving glycine residues will be " "computed using the sidechain hydrogen instead.", ) residue_membership = [ ( [ atom.index for atom in residue.atoms if atom.is_sidechain and not (atom.element == element.hydrogen) ] if not residue.name == "GLY" else [atom.index for atom in residue.atoms if atom.is_sidechain] ) for residue in traj.topology.residues ] residue_lens = [len(ainds) for ainds in residue_membership] atom_pairs = [] n_atom_pairs_per_residue_pair = [] for pair in residue_pairs: atom_pairs.extend( list( itertools.product( residue_membership[pair[0]], residue_membership[pair[1]], ), ), ) n_atom_pairs_per_residue_pair.append( residue_lens[pair[0]] * residue_lens[pair[1]], ) atom_distances = md.compute_distances(traj, atom_pairs, periodic=periodic) # now squash the results based on residue membership n_residue_pairs = len(residue_pairs) distances = np.zeros((len(traj), n_residue_pairs), dtype=np.float32) n_atom_pairs_per_residue_pair = np.asarray(n_atom_pairs_per_residue_pair) for i in range(n_residue_pairs): index = int(np.sum(n_atom_pairs_per_residue_pair[:i])) n = n_atom_pairs_per_residue_pair[i] if not soft_min: distances[:, i] = atom_distances[:, index : index + n].min(axis=1) else: distances[:, i] = soft_min_beta / np.log( np.sum( np.exp( soft_min_beta / atom_distances[:, index : index + n], casting='safe', ), axis=1, ), ) else: raise ValueError("This is not supposed to happen!") return distances, residue_pairs
[docs] def squareform(distances, residue_pairs): """Reshape the contact distance to square contact maps Parameters ---------- distances : np.ndarray, shape=(n_frames, n_pairs) Distances between pairs of residues, as computed by `mdtraj.geometry.compute_contacts`. residue_pairs : np.ndarray, shape=(n_pairs, 2) The indices of the residues involved in each pair, as returned by `mdtraj.geometry.compute_contacts` Returns ------- contact_maps : np.ndarray, shape=(n_frames, n_residues, n_residues) Reshaped version of `distances`, such that the distance, in the `k`th frame of the trajectory from residue `i` to residue `j` is given by `contact_maps[k, i, j]`. All entries in `contact_maps` corresponding to the distance between residues that were not part of residue_pairs are 0.0. See Also -------- mdtraj.compute_contacts : Compute the array of contact distances """ if not isinstance(distances, np.ndarray) and distances.ndim == 2: raise ValueError("distances must be a 2d array") residue_pairs = ensure_type( residue_pairs, dtype=int, ndim=2, name="residue_pars", shape=(None, 2), warn_on_cast=False, ) if not np.all(residue_pairs >= 0): raise ValueError( "residue_pairs references a residue that is not in " "the permitted range", ) if distances.shape[1] != residue_pairs.shape[0]: raise ValueError( "The number of pairs in distances, %d, does not " "match the number of pairs in residue_pairs, %d." % (distances.shape[1], residue_pairs.shape[0]), ) n_residues = np.max(residue_pairs) + 1 contact_maps = np.zeros( (distances.shape[0], n_residues, n_residues), dtype=distances.dtype, ) contact_maps[:, residue_pairs[:, 0], residue_pairs[:, 1]] = distances contact_maps[:, residue_pairs[:, 1], residue_pairs[:, 0]] = distances return contact_maps