Source code for sugar.data

"""
sugar.data -- Use genetic code and substitution matrices

For reference the IUPAC nucleotide code:

.. code-block:: text

    IUPAC nucleotide code 	Base
    A 	Adenine
    C 	Cytosine
    G 	Guanine
    T (or U) 	Thymine (or Uracil)
    R 	A or G
    Y 	C or T
    S 	G or C
    W 	A or T
    K 	G or T
    M 	A or C
    B 	C or G or T
    D 	A or G or T
    H 	A or C or T
    V 	A or C or G
    N 	any base
    . or - 	gap

And the amino acid codes:

.. code-block:: text

    IUPAC amino acid code 	Three letter code 	Amino acid
    A 	Ala 	Alanine
    C 	Cys 	Cysteine
    D 	Asp 	Aspartic Acid
    E 	Glu 	Glutamic Acid
    F 	Phe 	Phenylalanine
    G 	Gly 	Glycine
    H 	His 	Histidine
    I 	Ile 	Isoleucine
    K 	Lys 	Lysine
    L 	Leu 	Leucine
    M 	Met 	Methionine
    N 	Asn 	Asparagine
    P 	Pro 	Proline
    Q 	Gln 	Glutamine
    R 	Arg 	Arginine
    S 	Ser 	Serine
    T 	Thr 	Threonine
    V 	Val 	Valine
    W 	Trp 	Tryptophan
    Y 	Tyr 	Tyrosine
"""

from functools import lru_cache
from importlib.resources import files
import json
from os.path import exists

# IUPAC nucleotide code: maps each code letter to the bases it stands for
CODES = {
    # unambiguous bases
    'A': 'A',
    'C': 'C',
    'G': 'G',
    'T': 'T',
    # two possible bases
    'R': 'AG',
    'Y': 'CT',
    'S': 'GC',
    'W': 'AT',
    'K': 'GT',
    'M': 'AC',
    # three possible bases
    'B': 'CGT',
    'D': 'AGT',
    'H': 'ACT',
    'V': 'ACG',
    # any base
    'N': 'ACGT',
    # gap characters map to themselves
    '.': '.',
    '-': '-',
}


def _submat_files():
    """
    Return the sorted file names found in ``sugar.data.data_submat``

    Entries whose name starts with ``README`` are left out.
    """
    names = []
    for entry in files('sugar.data.data_submat').iterdir():
        if entry.name.startswith('README'):
            continue
        names.append(entry.name)
    names.sort()
    return names

@lru_cache
def submat(fname):
    """
    Load a substitution matrix and return it as a dict of dicts

    The first line which is neither blank nor a ``#`` comment is used as
    header holding the column letters. Each following line starts with its
    row letter followed by the values. The values of a row are converted to
    float if the row contains a dot, otherwise to int.
    The result is cached, repeated calls with the same argument return the
    same object.

    >>> from sugar.data import submat
    >>> bl = submat('blosum62')
    >>> bl['A']['A']
    4

    :param fname: One of the following values: ``{}``.
        Or use your own file.
    :return: dict mapping each row letter to a dict of column letter
        and value
    :raises FileNotFoundError: if ``fname`` is neither an existing path
        nor the name of a file inside ``sugar.data.data_submat``
    """
    from warnings import warn

    path = fname
    if not exists(path):
        # Not a path on disk, look the upper-cased name up inside the package
        fname2 = str(files('sugar.data.data_submat').joinpath(fname.upper()))
        if not exists(fname2):
            fnames = ', '.join(_submat_files())
            raise FileNotFoundError(
                f'No file at {fname} or {fname2}, available matrices: {fnames}'
            )
        path = fname2
    with open(path) as f:
        lines = f.read().splitlines()

    letters = None
    matrix = {}
    for line in lines:
        stripped = line.strip()
        if stripped == '' or stripped.startswith('#'):
            continue
        if letters is None:
            # First data line is the table header
            letters = line.split()
            continue
        l1, values = line.split(maxsplit=1)
        if l1 not in letters:
            warn(f'Letter {l1} not found in table header "{" ".join(letters)}"')
        # Number type is decided per row; conversion stays lazy so that
        # surplus values beyond the header length are never touched
        convert = float if '.' in values else int
        matrix[l1] = dict(zip(letters, map(convert, values.split())))
    return matrix
def scale_submat(sm, scale=1):
    """
    Return scaled substitution matrix

    The matrix values are divided by the sum of the absolute values of all
    entries and multiplied with the scale factor.
    The passed matrix is not modified, a new dict of dicts is returned.

    :param sm: substitution matrix as a dict of dicts
    :param scale: scale factor
    :return: new dict of dicts holding the scaled values
    :raises ZeroDivisionError: if the matrix has entries,
        but all of them are zero

    .. warning::
        It is not clear if this function is useful. It might be removed
        in a later version of sugar without further notice.
    """
    # Deliberately not wrapped in lru_cache: the cache needs hashable
    # arguments and a dict of dicts is not hashable.
    # The input is also left untouched, because matrices returned by the
    # cached submat() function are shared between all callers.
    total = sum(abs(v) for row in sm.values() for v in row.values())
    return {
        k1: {k2: v / total * scale for k2, v in row.items()}
        for k1, row in sm.items()
    }
@lru_cache
def gcode(tt=1):
    """
    Return a genetic code object

    The table is read from the ``gc.json`` file inside
    ``sugar.data.data_gcode``. The values of ``ttinv`` and the fields
    ``starts``, ``astarts``, ``stops`` and ``astops`` are converted to sets.
    The result is cached per ``tt`` argument.

    :param tt: number of the translation table (default: 1)

    >>> from sugar.data import gcode
    >>> gc = gcode()
    >>> gc.tt['TAG']
    '*'
    >>> gc.starts  # doctest: +SKIP
    {'ATG', 'CTG', 'TTG'}
    """
    from sugar import Attr

    path = files('sugar.data.data_gcode').joinpath('gc.json')
    with open(path) as fh:
        tables = json.load(fh)
    # Keys of the JSON object are strings, therefore look up str(tt)
    code = Attr(**tables[str(tt)])
    code.ttinv = {key: set(val) for key, val in code.ttinv.items()}
    for field in ('starts', 'astarts', 'stops', 'astops'):
        setattr(code, field, set(getattr(code, field)))
    return code
# Insert the names of the available matrix files into the ``{}`` placeholder
# of the submat docstring. The docstring is None when Python runs with -OO,
# a hasattr() check cannot detect that, so test for None explicitly.
if submat.__doc__ is not None:
    submat.__doc__ = submat.__doc__.format(', '.join(_submat_files()))