Source code for mdtraj.core.topology

##############################################################################
# MDTraj: A Python Library for Loading, Saving, and Manipulating
#         Molecular Dynamics Trajectories.
# Copyright 2012-2014 Stanford University and the Authors
#
# Authors: Peter Eastman, Robert McGibbon
# Contributors: Kyle A. Beauchamp, Matthew Harrigan, Carlos Xavier Hernandez
#
# MDTraj is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 2.1
# of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with MDTraj. If not, see <http://www.gnu.org/licenses/>.
#
# Portions of this code originate from the OpenMM molecular simulation
# toolkit, copyright (c) 2012 Stanford University and Peter Eastman. Those
# portions are distributed under the following terms:
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
# USE OR OTHER DEALINGS IN THE SOFTWARE.
##############################################################################


import itertools
import os
import warnings
import xml.etree.ElementTree as etree
from collections import namedtuple

import numpy as np

from mdtraj.core import element as elem
from mdtraj.core.residue_names import (
    _AMINO_ACID_CODES,
    _PROTEIN_RESIDUES,
    _WATER_RESIDUES,
)
from mdtraj.core.selection import parse_selection
from mdtraj.utils import ensure_type, ilen, import_
from mdtraj.utils.singleton import Singleton


def _topology_from_subset(topology, atom_indices):
    """Create a new topology that only contains the supplied indices

    Note
    ----
    This really should be a copy constructor (class method) on Topology.
    It used to work on OpenMM topologies, but we've diverged where that no
    longer works.

    Parameters
    ----------
    topology : mdtraj.Topology
        The base topology
    atom_indices : array_like, dtype=int
        The indices of the atoms to keep
    """
    np_indices = np.array(atom_indices)
    if len(np_indices) > 1:
        if (np_indices[1:] - np_indices[:-1]).min() < 1:
            warnings.warn("atom_indices are not monotonically increasing")
        if len(np.unique(np_indices)) < len(np_indices):
            warnings.warn("atom_indices are not unique")

    newTopology = Topology()
    old_atom_to_new_atom = {}

    for chain in topology._chains:
        newChain = newTopology.add_chain()
        for residue in chain._residues:
            resSeq = getattr(residue, "resSeq", None) or residue.index
            newResidue = None
            for atom in residue._atoms:
                if atom.index in atom_indices:
                    try:  # OpenMM Topology objects don't have serial attributes, so we have to check first.
                        serial = atom.serial
                    except AttributeError:
                        serial = None
                    if newResidue is None:
                        newResidue = newTopology.add_residue(
                            residue.name,
                            newChain,
                            resSeq,
                            residue.segment_id,
                        )
                    newAtom = newTopology.add_atom(
                        atom.name,
                        atom.element,
                        newResidue,
                        serial=serial,
                    )
                    old_atom_to_new_atom[atom] = newAtom

    bondsiter = topology.bonds
    if not hasattr(bondsiter, "__iter__"):
        bondsiter = bondsiter()

    for bond in bondsiter:
        try:
            atom1, atom2 = bond
            newTopology.add_bond(
                old_atom_to_new_atom[atom1],
                old_atom_to_new_atom[atom2],
                type=bond.type,
                order=bond.order,
            )
        except KeyError:
            pass
            # we only put bonds into the new topology if both of their partners
            # were indexed and thus HAVE a new atom

    # Delete empty residues
    newTopology._residues = [r for r in newTopology._residues if len(r._atoms) > 0]
    for chain in newTopology._chains:
        chain._residues = [r for r in chain._residues if len(r._atoms) > 0]

    # Delete empty chains
    newTopology._chains = [c for c in newTopology._chains if len(c._residues) > 0]
    # Re-set the numAtoms and numResidues
    newTopology._numAtoms = ilen(newTopology.atoms)
    newTopology._numResidues = ilen(newTopology.residues)

    # Reset the chain indices
    for i, chain in enumerate(newTopology.chains):
        chain.index = i
    # Reset the residue indices
    for i, res in enumerate(newTopology.residues):
        res.index = i

    return newTopology


[docs] class Topology: """Topology stores the topological information about a system. The structure of a Topology object is similar to that of a PDB file. It consists of a set of Chains (often but not always corresponding to polymer chains). Each Chain contains a set of Residues, and each Residue contains a set of Atoms. In addition, the Topology stores a list of which atom pairs are bonded to each other. Atom and residue names should follow the PDB 3.0 nomenclature for all molecules for which one exists. Attributes ---------- chains : generator Iterator over all Chains in the Topology. residues : generator Iterator over all Residues in the Chain. atoms : generator Iterator over all Atoms in the Chain. bonds : generator Iterator over all Bonds in the Topology Examples -------- >>> topology = md.load('example.pdb').topology >>> print(topology) <mdtraj.Topology with 1 chains, 3 residues, 22 atoms, 21 bonds at 0x105a98e90> >>> table, bonds = topology.to_dataframe() >>> print(table.head()) serial name element resSeq resName chainID 0 0 H1 H 1 CYS 0 1 1 CH3 C 1 CYS 0 2 2 H2 H 1 CYS 0 3 3 H3 H 1 CYS 0 4 4 C C 1 CYS 0 >>> # rename residue "CYS" to "CYSS" >>> table[table['residue'] == 'CYS']['residue'] = 'CYSS' >>> print(table.head()) serial name element resSeq resName chainID 0 0 H1 H 1 CYSS 0 1 1 CH3 C 1 CYSS 0 2 2 H2 H 1 CYSS 0 3 3 H3 H 1 CYSS 0 4 4 C C 1 CYSS 0 >>> t2 = md.Topology.from_dataframe(table, bonds) """ _standardBonds = {}
[docs] def __init__(self): """Create a new Topology object""" self._chains = [] self._numResidues = 0 self._numAtoms = 0 self._bonds = [] self._atoms = [] self._residues = []
def __ne__(self, other): return not self.__eq__(other) def __str__(self): return "<%s>" % (self._string_summary_basic()) def __repr__(self): return f"<{self._string_summary_basic()} at 0x{id(self):02x}>" def _string_summary_basic(self): return "mdtraj.Topology with %d chains, %d residues, " "%d atoms, %d bonds" % ( self.n_chains, self.n_residues, self.n_atoms, len(self._bonds), ) def copy(self): """Return a copy of the topology Returns ------- out : Topology A copy of this topology """ out = Topology() for chain in self.chains: c = out.add_chain() for residue in chain.residues: r = out.add_residue(residue.name, c, residue.resSeq, residue.segment_id) for atom in residue.atoms: out.add_atom(atom.name, atom.element, r, serial=atom.serial) for bond in self.bonds: a1, a2 = bond out.add_bond(a1, a2, type=bond.type, order=bond.order) return out def __copy__(self, *args): return self.copy() def __deepcopy__(self, *args): return self.copy() def __hash__(self): hash_value = hash(tuple(self._chains)) hash_value ^= hash(tuple(self._atoms)) hash_value ^= hash(tuple(self._bonds)) hash_value ^= hash(tuple(self._residues)) return hash_value def join(self, other, keep_resSeq=True): """Join two topologies together Parameters ---------- other : Topology Another topology object keep_resSeq : bool, optional, default=True if False the residue numbers (resSeq) of the topology that is joined are updated in order to continue from the last resSeq of this topology Returns ------- out : Topology A joint topology, with all of the atoms/residues/chains/bonds in each of the individual topologies """ if not isinstance(other, Topology): raise ValueError("other must be an instance of Topology to join") out = self.copy() # I need this in order to have the resSeq of the # new residues to continue from the one of the # las residue of this topology if not keep_resSeq: out_resSeq = out.atom(-1).residue.resSeq atom_mapping = {} for chain in other.chains: c = out.add_chain() for residue in chain.residues: if keep_resSeq: out_resSeq = residue.resSeq else: out_resSeq += 1 r = out.add_residue(residue.name, c, out_resSeq, residue.segment_id) for atom in residue.atoms: a = out.add_atom( atom.name, atom.element, r, serial=atom.serial, ) atom_mapping[atom] = a for bond in other.bonds: a1, a2 = bond out.add_bond( atom_mapping[a1], atom_mapping[a2], type=bond.type, order=bond.order, ) return out def to_fasta(self, chain=None): """Convert this topology into FASTA string Parameters ---------- chain : Integer, optional, default=None If specified, will return the FASTA string for this chain in the Topology. Returns ------- fasta : String or list of Strings A FASTA string for each chain specified. """ def fasta(c): return "".join([res.code for res in c.residues if res.is_protein and res.code is not None]) if chain is not None: if not isinstance(chain, int): raise ValueError("chain must be an Integer.") return fasta(self._chains[chain]) else: return [fasta(c) for c in self._chains] def to_openmm(self, traj=None): """Convert this topology into OpenMM topology Parameters ---------- traj : MDTraj.Trajectory, optional, default=None If specified, use the first frame from this trajectory to set the unitcell information in the openmm topology. Returns ------- topology : openmm.app.Topology This topology, as an OpenMM topology """ app = import_("openmm.app") mm = import_("openmm") u = import_("openmm.unit") out = app.Topology() atom_mapping = {} bond_mapping = { Single: app.Single, Double: app.Double, Triple: app.Triple, Amide: app.Amide, Aromatic: app.Aromatic, None: None, } for chain in self.chains: c = out.addChain(id=chain.chain_id) for residue in chain.residues: r = out.addResidue(residue.name, c, id=str(residue.resSeq)) for atom in residue.atoms: if atom.element is elem.virtual: element = None else: element = app.Element.getBySymbol(atom.element.symbol) # If we are using a compatible version of OpenMM, add formal charge try: a = out.addAtom(atom.name, element, r, formalCharge=atom.formal_charge) except TypeError: # This will occur if the version of OpenMM does not support formal charge (version<=8.2) a = out.addAtom(atom.name, element, r) atom_mapping[atom] = a for bond in self.bonds: a1, a2 = bond out.addBond( atom_mapping[a1], atom_mapping[a2], type=bond_mapping[bond.type], order=bond.order, ) if traj is not None: angles = traj.unitcell_angles[0] if np.linalg.norm(angles - 90.0) > 1e-4: raise ( ValueError( "Unitcell angles must be 90.0 to use " "in OpenMM topology.", ) ) box_vectors = mm.Vec3(*traj.unitcell_lengths[0]) * u.nanometer out.setUnitCellDimensions(box_vectors) return out @classmethod def from_openmm(cls, value): """Create a mdtraj topology from an OpenMM topology Parameters ---------- value : openmm.app.Topology An OpenMM topology that you wish to convert to a mdtraj topology. """ app = import_("openmm.app") bond_mapping = { app.Single: Single, app.Double: Double, app.Triple: Triple, app.Amide: Amide, app.Aromatic: Aromatic, None: None, } if not isinstance(value, app.Topology): raise TypeError( "value must be an OpenMM Topology. " "You supplied a %s" % type(value), ) out = cls() atom_mapping = {} for chain in value.chains(): c = out.add_chain(chain_id=chain.id) for residue in chain.residues(): try: r = out.add_residue(residue.name, c, resSeq=int(residue.id)) except ValueError: r = out.add_residue(residue.name, c) for atom in residue.atoms(): if atom.element is None: element = elem.virtual else: element = elem.get_by_symbol(atom.element.symbol) # If we are using a compatible version of OpenMM (>=8.2), add formal charge if hasattr(atom, "formalCharge"): a = out.add_atom(atom.name, element, r, formal_charge=atom.formalCharge) else: a = out.add_atom(atom.name, element, r) atom_mapping[atom] = a for bond in value.bonds(): a1, a2 = bond out.add_bond( atom_mapping[a1], atom_mapping[a2], type=bond_mapping[bond.type], order=bond.order, ) return out def to_dataframe(self): """Convert this topology into a pandas dataframe Returns ------- atoms : pandas.DataFrame The atoms in the topology, represented as a data frame. bonds : np.ndarray, shape=(n_bonds, 4), dtype=float, Optional The bonds in this topology, represented as an n_bonds x 4 array indicating the two atom indices of the bond, the bond type, and bond order, cast to floats as the type is mapped from the classes to fractional. The atom indices and order are integers cast to float """ pd = import_("pandas") data = [ ( atom.serial, atom.name, atom.element.symbol, atom.residue.resSeq, atom.residue.name, atom.residue.chain.index, atom.segment_id, ) for atom in self.atoms ] atoms = pd.DataFrame( data, columns=[ "serial", "name", "element", "resSeq", "resName", "chainID", "segmentID", ], ) bonds = np.zeros([len(self._bonds), 4], dtype=float) for index, bond in enumerate(self.bonds): if bond.order is None: order = 0.0 else: order = bond.order try: bond_type = float(bond.type) except TypeError: # Trap the None case bond_type = 0.0 bonds[index] = bond.atom1.index, bond.atom2.index, bond_type, order return atoms, bonds @classmethod def from_dataframe(cls, atoms, bonds=None): """Create a mdtraj topology from a pandas data frame Parameters ---------- atoms : pandas.DataFrame The atoms in the topology, represented as a data frame. This data frame should have columns "serial" (atom index), "name" (atom name), "element" (atom's element), "resSeq" (index of the residue) "resName" (name of the residue), "chainID" (index of the chain), and optionally "segmentID", following the same conventions as wwPDB 3.0 format. bonds : np.ndarray, shape=(n_bonds, 4) or (n_bonds, 2), dtype=float, Optional The bonds in the topology, represented as a n_bonds x 4 or n_bonds x 2 size array of the indices of the atoms involved, type, and order of each bond, represented as floats. Type and order are optional. Specifying bonds here is optional. To create standard protein bonds, you can use `create_standard_bonds` to "fill in" the bonds on your newly created Topology object, although type and order of bond are not computed if that method is used. See Also -------- create_standard_bonds """ pd = import_("pandas") if bonds is None: bonds = np.zeros([0, 4], dtype=float) for col in [ "name", "element", "resSeq", "resName", "chainID", "serial", ]: if col not in atoms.columns: raise ValueError("dataframe must have column %s" % col) if "segmentID" not in atoms.columns: atoms["segmentID"] = "" out = cls() if not isinstance(atoms, pd.DataFrame): raise TypeError( "atoms must be an instance of pandas.DataFrame. " "You supplied a %s" % type(atoms), ) if not isinstance(bonds, np.ndarray): raise TypeError( "bonds must be an instance of numpy.ndarray. " "You supplied a %s" % type(bonds), ) if not np.all(np.arange(len(atoms)) == atoms.index): raise ValueError( "atoms must be uniquely numbered " "starting from zero.", ) out._atoms = [None for i in range(len(atoms))] c = None r = None previous_chainID = None previous_resName = None previous_resSeq = None for atom_index, atom in atoms.iterrows(): int(atom_index) # Fixes bizarre hashing issue on Py3K. See #545 if atom["chainID"] != previous_chainID: previous_chainID = atom["chainID"] c = out.add_chain() if atom["resSeq"] != previous_resSeq or atom["resName"] != previous_resName or c.n_atoms == 0: previous_resSeq = atom["resSeq"] previous_resName = atom["resName"] r = out.add_residue( atom["resName"], c, atom["resSeq"], atom["segmentID"], ) a = Atom( atom["name"], elem.get_by_symbol(atom["element"]), atom_index, r, serial=atom["serial"], ) out._atoms[atom_index] = a r._atoms.append(a) for bond in bonds: ai1 = int(bond[0]) ai2 = int(bond[1]) try: bond_type = float_to_bond_type(bond[2]) bond_order = int(bond[3]) if bond_order == 0: bond_order = None except IndexError: # Does not exist bond_type = None bond_order = None out.add_bond(out.atom(ai1), out.atom(ai2), bond_type, bond_order) out._numAtoms = out.n_atoms return out def to_bondgraph(self): """Create a NetworkX graph from the atoms and bonds in this topology Returns ------- g : nx.Graph A graph whose nodes are the Atoms in this topology, and whose edges are the bonds See Also -------- atoms bonds Notes ----- This method requires the NetworkX python package. """ nx = import_("networkx") g = nx.Graph() g.add_nodes_from(self.atoms) g.add_edges_from(self.bonds) return g def __eq__(self, other): """Are two topologies equal? Parameters ---------- other : object The object to compare to Returns ------- equality : bool Are the two topologies identical? """ if not isinstance(other, Topology): return False if self is other: return True if len(self._chains) != len(other._chains): return False for c1, c2 in zip(self.chains, other.chains): if c1.index != c2.index: return False if len(c1._residues) != len(c2._residues): return False for r1, r2 in zip(c1.residues, c2.residues): if (r1.index != r1.index) or (r1.name != r2.name): # or (r1.resSeq != r2.resSeq): return False if len(r1._atoms) != len(r2._atoms): return False for a1, a2 in zip(r1.atoms, r2.atoms): if (a1.index != a2.index) or (a1.name != a2.name): return False if a1.element is not a2.element: return False # for attr in ['atomic_number', 'name', 'symbol']: # if getattr(a1.element, attr) != getattr(a2.element, attr): # return False if len(self._bonds) != len(other._bonds): return False # the bond ordering is somewhat ambiguous, so try and fix it for comparison self_sorted_bonds = sorted([bond for bond in self.bonds]) other_sorted_bonds = sorted([bond for bond in other.bonds]) for i, bond in enumerate(self_sorted_bonds): if bond != other_sorted_bonds[i]: return False return True def add_chain(self, chain_id=None): """Create a new Chain and add it to the Topology. Parameters ---------- chain_id : str The PDB chainID of the Chain. Returns ------- chain : mdtraj.topology.Chain the newly created Chain """ chain = Chain(len(self._chains), self, chain_id) self._chains.append(chain) return chain def add_residue(self, name, chain, resSeq=None, segment_id=""): """Create a new Residue and add it to the Topology. Parameters ---------- name : str The name of the residue to add chain : mdtraj.topology.Chain The Chain to add it to resSeq : int, optional Residue sequence number, such as from a PDB record. These sequence numbers are arbitrary, and do not necessarily start at 0 (or 1). If not supplied, the resSeq attribute will be set to the residue's sequential (0 based) index. segment_id : str, optional A label for the segment to which this residue belongs Returns ------- residue : mdtraj.topology.Residue The newly created Residue """ if resSeq is None: resSeq = self._numResidues residue = Residue(name, self._numResidues, chain, resSeq, segment_id) self._residues.append(residue) self._numResidues += 1 chain._residues.append(residue) return residue def insert_atom(self, name, element, residue, index=None, rindex=None, serial=None): """Create a new Atom and insert it into the Topology at a specific position. Parameters ---------- name : str The name of the atom to add element : mdtraj.element.Element The element of the atom to add residue : mdtraj.topology.Residue The Residue to add it to index : int If provided, the desired index for this atom within the topology. Existing atoms with indices >= index will be pushed back. rindex : int If provided, the desired position for this atom within the residue serial : int Serial number associated with the atom. This has nothing to do with the actual ordering and is solely for PDB labeling purposes. Returns ------- atom : mdtraj.topology.Atom the newly created Atom """ if element is None: element = elem.virtual if index is None: atom = Atom(name, element, self._numAtoms, residue, serial=serial) self._atoms.append(atom) else: atom = Atom(name, element, index, residue, serial=serial) for i in range(index, len(self._atoms)): self._atoms[i].index += 1 self._atoms.insert(index, atom) self._numAtoms += 1 if rindex is None: residue._atoms.append(atom) else: residue._atoms.insert(rindex, atom) return atom def delete_atom_by_index(self, index): """Delete an Atom from the topology. Parameters ---------- index : int The index of the atom to be removed. """ a = self._atoms[index] if a.index != index: raise RuntimeError( "Index of selected atom does not match order in topology.", ) for i in range(index + 1, len(self._atoms)): self._atoms[i].index -= 1 a.residue._atoms.remove(a) self._atoms.remove(a) self._numAtoms -= 1 def add_atom(self, name, element, residue, serial=None, formal_charge=None): """Create a new Atom and add it to the Topology. Parameters ---------- name : str The name of the atom to add element : mdtraj.element.Element The element of the atom to add residue : mdtraj.topology.Residue The Residue to add it to serial : int, optional Serial number associated with the atom. formal_charge : int, optional The formal charge of the atom to add. Returns ------- atom : mdtraj.topology.Atom the newly created Atom """ if element is None: element = elem.virtual atom = Atom(name, element, self._numAtoms, residue, serial=serial, formal_charge=formal_charge) self._atoms.append(atom) self._numAtoms += 1 residue._atoms.append(atom) return atom def add_bond(self, atom1, atom2, type=None, order=None): """Create a new bond and add it to the Topology. Parameters ---------- atom1 : mdtraj.topology.Atom The first Atom connected by the bond atom2 : mdtraj.topology.Atom The second Atom connected by the bond type : mdtraj.topology.Singleton or None, Default: None, Optional Bond type of the bond, or None if not known/provided order : 1, 2, 3 or None, Default: None, Optional Characteristic order of the bond if known, defaults None """ if atom1.index < atom2.index: self._bonds.append(Bond(atom1, atom2, type=type, order=order)) else: self._bonds.append(Bond(atom2, atom1, type=type, order=order)) def chain(self, index): """Get a specific chain by index. These indices start from zero. Parameters ---------- index : int The index of the chain to select. Returns ------- chain : Chain The `index`-th chain in the topology. """ return self._chains[index] @property def chains(self): """Iterator over all Chains in the Topology. Returns ------- chainiter : listiterator Iterator over all Chains in the Topology. """ return iter(self._chains) @property def n_chains(self): """Get the number of chains in the Topology""" return len(self._chains) def residue(self, index): """Get a specific residue by index. These indices start from zero. Parameters ---------- index : int The index of the residue to select. Returns ------- residue : Residue The `index`-th residue in the topology. """ return self._residues[index] @property def residues(self): """Iterator over all Residues in the Topology. Returns ------- residueiter : generator Iterator over all Residues in the Topology. """ for chain in self._chains: yield from chain._residues @property def n_residues(self): """Get the number of residues in the Topology.""" return len(self._residues) def atom(self, index): """Get a specific atom by index. These indices start from zero. Parameters ---------- index : int The index of the atom to select. Returns ------- atom : Atom The `index`-th atom in the topology. """ return self._atoms[index] @property def atoms(self): """Iterator over all Atoms in the Topology. Returns ------- atomiter : generator Iterator over all Atoms in the Topology. """ for chain in self._chains: for residue in chain._residues: yield from residue._atoms def atoms_by_name(self, name): """Iterator over all Atoms in the Topology with a specified name Parameters ---------- name : str The particular atom name of interest. Examples -------- >>> for atom in topology.atoms_by_name('CA'): ... print(atom) Returns ------- atomiter : generator """ for atom in self.atoms: if atom.name == name: yield atom @property def n_atoms(self): """Get the number of atoms in the Topology""" return len(self._atoms) @property def bonds(self): """Iterator over all bonds (each represented as a tuple of two Atoms) in the Topology. Returns ------- bonditer : generator Iterator over all bonds between Atoms in the Trajectory. Each bond can then be iterated to get the atoms. I.e.: `for atom1, atom2 in Topology.bonds` yields iterator over pairs of Atoms `for bond in Topology.bonds` yields iterator over bonds """ return iter(self._bonds) @property def n_bonds(self): """Get the number of bonds in the Topology""" return len(self._bonds) def create_standard_bonds(self): """Create bonds based on the atom and residue names for all standard residue types.""" if len(Topology._standardBonds) == 0: # Load the standard bond defitions. tree = etree.parse( os.path.join( os.path.dirname(__file__), "..", "formats", "pdb", "data", "residues.xml", ), ) for residue in tree.getroot().findall("Residue"): bonds = [] Topology._standardBonds[residue.attrib["name"]] = bonds for bond in residue.findall("Bond"): bonds.append((bond.attrib["from"], bond.attrib["to"])) for chain in self._chains: # First build a map of atom names to atoms. atomMaps = [] for residue in chain._residues: atomMap = {} atomMaps.append(atomMap) for atom in residue._atoms: atomMap[atom.name] = atom # Loop over residues and construct bonds. for i in range(len(chain._residues)): name = chain._residues[i].name if name in Topology._standardBonds: for bond in Topology._standardBonds[name]: if bond[0].startswith("-") and i > 0: fromResidue = i - 1 fromAtom = bond[0][1:] elif bond[0].startswith("+") and i < len(chain._residues): fromResidue = i + 1 fromAtom = bond[0][1:] else: fromResidue = i fromAtom = bond[0] if bond[1].startswith("-") and i > 0: toResidue = i - 1 toAtom = bond[1][1:] elif bond[1].startswith("+") and i < len(chain._residues): toResidue = i + 1 toAtom = bond[1][1:] else: toResidue = i toAtom = bond[1] if fromAtom in atomMaps[fromResidue] and toAtom in atomMaps[toResidue]: self.add_bond( atomMaps[fromResidue][fromAtom], atomMaps[toResidue][toAtom], ) def create_disulfide_bonds(self, positions): """Identify disulfide bonds based on proximity and add them to the Topology. Parameters ---------- positions : list The list of atomic positions based on which to identify bonded atoms """ def isCyx(res): names = [atom.name for atom in res._atoms] return "SG" in names and "HG" not in names cyx = [res for res in self.residues if res.name == "CYS" and isCyx(res)] atomNames = [[atom.name for atom in res._atoms] for res in cyx] for i in range(len(cyx)): sg1 = cyx[i]._atoms[atomNames[i].index("SG")] pos1 = positions[sg1.index] for j in range(i): sg2 = cyx[j]._atoms[atomNames[j].index("SG")] pos2 = positions[sg2.index] delta = [x - y for (x, y) in zip(pos1, pos2)] distance = np.sqrt( delta[0] * delta[0] + delta[1] * delta[1] + delta[2] * delta[2], ) if distance < 0.3: # this is supposed to be nm. I think we're good self.add_bond(sg1, sg2) def subset(self, atom_indices): """Create a new Topology from a subset of the atoms in an existing topology. Notes ----- The existing topology will not be altered. Parameters ---------- atom_indices : array_like A list of the indices corresponding to the atoms in that you'd like to retain. """ return _topology_from_subset(self, atom_indices) def select_expression(self, selection_string): """Translate a atom selection expression into a pure python expression. Parameters ---------- selection_string : str An expression in the MDTraj atom selection DSL Examples -------- >>> topology.select_expression('name O and water') "[atom.index for atom in topology.atoms if ((atom.name == 'O') and atom.residue.is_water)]") Returns ------- python_string : str A string containing a pure python expression, equivalent to the selection expression. """ condition = parse_selection(selection_string).source fmt_string = "[atom.index for atom in topology.atoms if {condition}]" return fmt_string.format(condition=condition) def select(self, selection_string): """Execute a selection against the topology Parameters ---------- selection_string : str An expression in the MDTraj atom selection DSL Examples -------- >>> topology.select('name O and water') array([1, 3, 5, 10, ...]) Returns ------- indices : np.ndarray, dtype=int, ndim=1 Array of the indices of the atoms matching the selection expression. See Also -------- select_expression, mdtraj.core.selection.parse_selection """ filter_func = parse_selection(selection_string).expr indices = np.array([a.index for a in self.atoms if filter_func(a)]) return indices def select_atom_indices(self, selection="minimal"): """Get the indices of biologically-relevant groups by name. Parameters ---------- selection : {'all', 'alpha', 'minimal', 'heavy', 'water'} What types of atoms to select. ``all`` All atoms ``alpha`` Protein residue alpha carbons ``minimal`` Keep the atoms in protein residues with names in {CA, CB, C, N, O} ``heavy`` All non-hydrogen protein atoms. ``water`` Water oxygen atoms Returns ---------- indices : np.ndarray (N,) An array of the indices of the selected atoms. """ selection = selection.lower() options = ["all", "alpha", "minimal", "heavy", "water"] if selection == "all": atom_indices = np.arange(self.n_atoms) elif selection == "alpha": atom_indices = [a.index for a in self.atoms if a.name == "CA" and a.residue.is_protein] elif selection == "minimal": atom_indices = [ a.index for a in self.atoms if a.name in ["CA", "CB", "C", "N", "O"] and a.residue.is_protein ] elif selection == "heavy": atom_indices = [a.index for a in self.atoms if a.element != elem.hydrogen and a.residue.is_protein] elif selection == "water": atom_indices = [a.index for a in self.atoms if a.name in ["O", "OW"] and a.residue.is_water] else: raise ValueError( "{} is not a valid option. Selection must be one of {}".format( selection, ", ".join(options), ), ) indices = np.array(atom_indices) return indices def select_pairs(self, selection1=None, selection2=None): """Generate unique pairs of atom indices. If a selecton is a string, it will be resolved using the atom selection DSL, otherwise it is expected to be an array of atom indices. Parameters ---------- selection1 : str or array-like, shape=(n_indices, ), dtype=int A selection for `select()` or an array of atom indices. selection2 : str or array-like, shape=(n_indices, ), dtype=int A selection for `select()` or an array of atom indices. Returns ------- pairs : array-like, shape=(n_pairs, 2), dtype=int Each row gives the indices of two atoms. """ # Resolve selections using the atom selection DSL... if isinstance(selection1, str): a_indices = self.select(selection1) else: # ...or use a provided array of indices. a_indices = ensure_type( selection1, dtype=np.int32, ndim=1, name="a_indices", warn_on_cast=False, ) if isinstance(selection2, str): b_indices = self.select(selection2) else: b_indices = ensure_type( selection2, dtype=np.int32, ndim=1, name="b_indices", warn_on_cast=False, ) a_indices.sort() b_indices.sort() # Create unique pairs from the indices. # In the cases where a_indices and b_indices are identical or mutually # exclusive, we can utilize a more efficient and memory friendly # approach by removing the intermediate set creation required in # the general case. if np.array_equal(a_indices, b_indices): pairs = self._unique_pairs_equal(a_indices) elif len(np.intersect1d(a_indices, b_indices)) == 0: pairs = self._unique_pairs_mutually_exclusive(a_indices, b_indices) else: pairs = self._unique_pairs(a_indices, b_indices) return pairs @classmethod def _unique_pairs(cls, a_indices, b_indices): return np.array( list( {(a, b) if a > b else (b, a) for a, b in itertools.product(a_indices, b_indices) if a != b}, ), dtype=np.int32, ) @classmethod def _unique_pairs_mutually_exclusive(cls, a_indices, b_indices): pairs = np.fromiter( itertools.chain.from_iterable( itertools.product(a_indices, b_indices), ), dtype=np.int32, count=len(a_indices) * len(b_indices) * 2, ) return np.vstack((pairs[::2], pairs[1::2])).T @classmethod def _unique_pairs_equal(cls, a_indices): pairs = np.fromiter( itertools.chain.from_iterable( itertools.combinations(a_indices, 2), ), dtype=np.int32, count=len(a_indices) * (len(a_indices) - 1), ) return np.vstack((pairs[::2], pairs[1::2])).T def find_molecules(self): """Identify molecules based on bonds. A molecule is defined as a set of atoms that are connected to each other by bonds. This method uses the list of bonds to divide up the Topology's atoms into molecules. Returns ------- molecules : list of sets Each entry represents one molecule, and is the set of all Atoms in that molecule """ if len(self._bonds) == 0 and any(res.n_atoms > 1 for res in self._residues): raise ValueError( "Cannot identify molecules because this Topology does not include bonds", ) # Make a list of every other atom to which each atom is connected. num_atoms = self.n_atoms atom_bonds = [[] for i in range(num_atoms)] for atom1, atom2 in self.bonds: atom_bonds[atom1.index].append(atom2.index) atom_bonds[atom2.index].append(atom1.index) # This is essentially a recursive algorithm, but it is reformulated as a loop to avoid # stack overflows. It selects an atom, marks it as a new molecule, then recursively # marks every atom bonded to it as also being in that molecule. atom_molecule = [-1] * num_atoms num_molecules = 0 for i in range(num_atoms): if atom_molecule[i] == -1: # Start a new molecule. atom_stack = [i] neighbor_stack = [0] molecule = num_molecules num_molecules += 1 # Recursively tag all the bonded atoms. while len(atom_stack) > 0: atom = atom_stack[-1] atom_molecule[atom] = molecule while ( neighbor_stack[-1] < len(atom_bonds[atom]) and atom_molecule[atom_bonds[atom][neighbor_stack[-1]]] != -1 ): neighbor_stack[-1] += 1 if neighbor_stack[-1] < len(atom_bonds[atom]): atom_stack.append(atom_bonds[atom][neighbor_stack[-1]]) neighbor_stack.append(0) else: del atom_stack[-1] del neighbor_stack[-1] # Build the final output. molecules = [set() for i in range(num_molecules)] for atom in self.atoms: molecules[atom_molecule[atom.index]].add(atom) return molecules def guess_anchor_molecules(self): """Guess anchor molecules for imaging Returns ------- anchor_molecules : list of atom sets List of anchor molecules See Also -------- Trajectory.image_molecules """ molecules = self.find_molecules() # Select the anchor molecules. molecules.sort(key=lambda x: -len(x)) atoms_cutoff = max( len(molecules[int(0.1 * len(molecules))]), int(0.1 * len(molecules[0])), ) anchor_molecules = [mol for mol in molecules if len(mol) > atoms_cutoff] num_anchors = len(anchor_molecules) if num_anchors == 0: raise ValueError( "Could not find any anchor molecules. Based on " "our heuristic, those should be molecules with " f"more than {atoms_cutoff} atoms. Perhaps your topology " "doesn't give an accurate bond graph?", ) return anchor_molecules
class Chain: """A Chain object represents a chain within a Topology. Attributes ---------- index : int The index of the Chain within its Topology topology : mdtraj.Topology The Topology this Chain belongs to chain_id : str The PDB chainID of the Chain. residues : generator Iterator over all Residues in the Chain. atoms : generator Iterator over all Atoms in the Chain. """ def __init__(self, index, topology, chain_id=None): """Construct a new Chain. You should call add_chain() on the Topology instead of calling this directly.""" # The index of the Chain within its Topology self.index = index # The Topology this Chain belongs to self.topology = topology self._residues = [] # PDB format chainID self.chain_id = chain_id @property def residues(self): """Iterator over all Residues in the Chain. Returns ------- residueiter : listiterator Iterator over all Residues in the Topology. """ return iter(self._residues) def residue(self, index): """Get a specific residue in this Chain. Parameters ---------- index : int The index of the residue to select. Returns ------- residue : Residue """ return self._residues[index] @property def n_residues(self): """Get the number of residues in this Chain.""" return len(self._residues) @property def atoms(self): """Iterator over all Atoms in the Chain. Returns ------- atomiter : generator Iterator over all Atoms in the Chain. """ for residue in self._residues: yield from residue._atoms def atoms_by_name(self, name): """Iterator over all Atoms in the Chain with a specified name. Parameters ---------- name : str The particular atom name of interest. Examples -------- >>> for atom in chain.atoms_by_name('CA'): ... print(atom) Returns ------- atomiter : generator """ for atom in self.atoms: if atom.name == name: yield atom def atom(self, index): """Get a specific atom in this Chain. Parameters ---------- index : int The index of the atom to select. Returns ------- atom : Atom """ # this could be made faster by caching the list # of atoms internally if necessary return next(itertools.islice(self.atoms, index, index + 1)) @property def n_atoms(self): """Get the number of atoms in this Chain""" return sum(r.n_atoms for r in self._residues) def __hash__(self): return self.index class Residue: """A Residue object represents a residue within a Topology. Attributes ---------- name : str The name of the Residue index : int The index of the Residue within its Topology chain : mdtraj.topology.Chain The chain within which this residue belongs resSeq : int The residue sequence number segment_id : str, optional A label for the segment to which this residue belongs """ def __init__(self, name, index, chain, resSeq, segment_id=""): """Construct a new Residue. You should call add_residue() on the Topology instead of calling this directly.""" self.name = name self.index = index self.chain = chain self.resSeq = resSeq self.segment_id = segment_id self._atoms = [] @property def atoms(self): """Iterator over all Atoms in the Residue. Returns ------- atomiter : listiterator Iterator over all Atoms in the Residue. """ return iter(self._atoms) def atoms_by_name(self, name): """Iterator over all Atoms in the Residue with a specified name Parameters ---------- name : str The particular atom name of interest. Examples -------- >>> for atom in residue.atoms_by_name('CA'): ... print(atom) Returns ------- atomiter : generator """ for atom in self.atoms: if atom.name == name: yield atom def atom(self, index_or_name): """Get a specific atom in this Residue. Parameters ---------- index_or_name : {int, str} Either a (zero-based) index, or the name of the atom. If a string is passed in, the first atom -- in index order -- with a matching name wil be returned. Returns ------- atom : Atom """ try: return self._atoms[index_or_name] except TypeError: try: return next(self.atoms_by_name(index_or_name)) except StopIteration: raise KeyError("no matching atom found") @property def n_atoms(self): """Get the number of atoms in this Residue""" return len(self._atoms) @property def is_protein(self): """Whether the residue is one found in proteins.""" return self.name in _PROTEIN_RESIDUES @property def code(self): """Get the one letter code for this Residue""" if self.is_protein: return _AMINO_ACID_CODES[self.name] else: return None @property def is_water(self): """Whether the residue is water. Residue names according to VMD References ---------- http://www.ks.uiuc.edu/Research/vmd/vmd-1.3/ug/node133.html """ return self.name in _WATER_RESIDUES @property def is_nucleic(self): """Whether the residue is one found in nucleic acids.""" raise NotImplementedError def __str__(self): return f"{self.name}{self.resSeq}" def __repr__(self): return str(self) def __hash__(self): return hash((self.name, self.index, self.resSeq, self.segment_id)) class Atom: """An Atom object represents a residue within a Topology. Attributes ---------- name : str The name of the Atom element : mdtraj.element.Element The element of the Atoms index : int The index of the Atom within its Topology residue : mdtraj.topology.Residue The Residue this Atom belongs to serial : int The serial number from the PDB specification. Unlike index, this may not be contiguous or 0-indexed. formal_charge : float """ def __init__(self, name, element, index, residue, serial=None, formal_charge=None): """Construct a new Atom. You should call add_atom() on the Topology instead of calling this directly.""" # The name of the Atom self.name = name # That Atom's element self.element = element # The index of the Atom within its Topology self.index = index # The Residue this Atom belongs to self.residue = residue # The not-necessarily-contiguous "serial" number from the PDB spec self.serial = serial # The formal charge of the atom self.formal_charge = formal_charge @property def n_bonds(self): """Number of bonds in which the atom participates.""" # TODO: this info could be cached. return ilen(bond for bond in self.residue.chain.topology.bonds if self in bond) @property def is_backbone(self): """Whether the atom is in the backbone of a protein residue""" return self.name in {"C", "CA", "N", "O"} and self.residue.is_protein @property def is_sidechain(self): """Whether the atom is in the sidechain of a protein residue""" return self.name not in {"C", "CA", "N", "O", "HA", "H"} and self.residue.is_protein @property def segment_id(self): """User specified segment_id of the residue to which this atom belongs""" return self.residue.segment_id def __eq__(self, other): """Check whether two Atom objects are equal.""" if self.name != other.name: return False if self.index != other.index: return False if self.element.name != other.element.name: return False if self.residue.name != other.residue.name: return False if self.residue.index != other.residue.index: return False if self.residue.chain.index != other.residue.chain.index: return False return True def __hash__(self): """A quick comparison.""" return self.index def __str__(self): return f"{self.residue}-{self.name}" def __repr__(self): return str(self) # Enumerated values for bond type class Single(Singleton): def __repr__(self): return "Single" def __float__(self): return 1.0 Single = Single() class Double(Singleton): def __repr__(self): return "Double" def __float__(self): return 2.0 Double = Double() class Triple(Singleton): def __repr__(self): return "Triple" def __float__(self): return 3.0 Triple = Triple() class Aromatic(Singleton): def __repr__(self): return "Aromatic" def __float__(self): return 1.5 Aromatic = Aromatic() class Amide(Singleton): def __repr__(self): return "Amide" def __float__(self): return 1.25 Amide = Amide() def float_to_bond_type(bond_float): """ Convert a float to known bond type class, or None if no matched class if found Parameters ---------- bond_float : float Representation of built in bond types as a float, Maps this float to specific bond class, if the float has no map, None is returned instead Returns ------- bond_type : mdtraj.topology.Singleton subclass or None Bond type matched to the float if known If no match is found, returns None (which is also a valid type for the Bond class) """ all_bond_types = [Single, Double, Triple, Aromatic, Amide] for bond_type in all_bond_types: if float(bond_type) == float(bond_float): return bond_type return None class Bond(namedtuple("Bond", ["atom1", "atom2"])): """A Bond representation of a bond between two Atoms within a Topology Attributes ---------- atom1 : mdtraj.topology.Atom The first atom in the bond atom2 : mdtraj.topology.Atom The second atom in the bond order : instance of mdtraj.topology.Singleton or None type : int on [1,3] domain or None """ def __new__(cls, atom1, atom2, type=None, order=None): """Construct a new Bond. You should call add_bond() on the Topology instead of calling this directly. Must use __new__ constructor since this is an immutable class """ bond = super().__new__(cls, atom1, atom2) assert isinstance(type, Singleton) or type is None, "Type must be None or a Singleton" assert order is None or 1 <= order <= 3, "Order must be int between 1 to 3 or None" bond.type = type bond.order = order return bond def __getnewargs__(self): """ Support for pickle protocol 2: http://docs.python.org/2/library/pickle.html#pickling-and-unpickling-normal-class-instances """ return self[0], self[1], self.type, self.order @property def _equality_tuple(self): # Hierarchy of parameters: Atom1 index -> Atom2 index -> type -> order return ( self[0].index, self[1].index, float(self.type) if self.type is not None else 0.0, self.order if self.order is not None else 0, ) def __deepcopy__(self, memo): return Bond(self[0], self[1], self.type, self.order) def __eq__(self, other): if not isinstance(other, Bond): return False return self._equality_tuple == other._equality_tuple def __repr__(self): s = f"Bond({self[0]}, {self[1]}" if self.type is not None: s = f"{s}, type={self.type}" if self.order is not None: s = "%s, order=%d" % (s, self.order) s += ")" return s def __str__(self): return self.__repr__() def __hash__(self): # Set of atoms making up bonds, the type, and the order return hash((self[0], self[1], self.type, self.order)) @staticmethod def _other_is_bond(other): # Ensure other type for inequalities is a bond if not isinstance(other, Bond): raise TypeError("Bond inequalities can only be compared with other bonds") def __gt__(self, other): # Cannot use total_ordering because namedtuple # has its own __gt__, __lt__, etc. methods, which # supersede total_ordering self._other_is_bond(other) return self._equality_tuple > other._equality_tuple def __ge__(self, other): self._other_is_bond(other) return self._equality_tuple >= other._equality_tuple def __lt__(self, other): self._other_is_bond(other) return self._equality_tuple < other._equality_tuple def __le__(self, other): self._other_is_bond(other) return self._equality_tuple <= other._equality_tuple def __ne__(self, other): return not self.__eq__(other) def __getstate__(self): # This is required for pickle because the parent class # does not properly return a state return self.__dict__