Source code for sugar.data

"""
sugar.data -- Use genetic code and substitution matrices

For reference the IUPAC nucleotide code:

.. code-block:: text

    IUPAC nucleotide code 	Base
    A 	Adenine
    C 	Cytosine
    G 	Guanine
    T (or U) 	Thymine (or Uracil)
    R 	A or G
    Y 	C or T
    S 	G or C
    W 	A or T
    K 	G or T
    M 	A or C
    B 	C or G or T
    D 	A or G or T
    H 	A or C or T
    V 	A or C or G
    N 	any base
    . or - 	gap

And the amino acid codes:

.. code-block:: text

    IUPAC amino acid code 	Three letter code 	Amino acid
    A 	Ala 	Alanine
    C 	Cys 	Cysteine
    D 	Asp 	Aspartic Acid
    E 	Glu 	Glutamic Acid
    F 	Phe 	Phenylalanine
    G 	Gly 	Glycine
    H 	His 	Histidine
    I 	Ile 	Isoleucine
    K 	Lys 	Lysine
    L 	Leu 	Leucine
    M 	Met 	Methionine
    N 	Asn 	Asparagine
    P 	Pro 	Proline
    Q 	Gln 	Glutamine
    R 	Arg 	Arginine
    S 	Ser 	Serine
    T 	Thr 	Threonine
    V 	Val 	Valine
    W 	Trp 	Tryptophan
    Y 	Tyr 	Tyrosine
"""

from functools import lru_cache
from importlib.resources import files
import json
from os.path import exists

# IUPAC nucleotide code: maps each symbol to the string of bases it stands for
CODES = {
    # unambiguous bases
    'A': 'A',
    'C': 'C',
    'G': 'G',
    'T': 'T',
    # two-base ambiguity symbols
    'R': 'AG',
    'Y': 'CT',
    'S': 'GC',
    'W': 'AT',
    'K': 'GT',
    'M': 'AC',
    # three-base ambiguity symbols
    'B': 'CGT',
    'D': 'AGT',
    'H': 'ACT',
    'V': 'ACG',
    # any base
    'N': 'ACGT',
    # gap characters map to themselves
    '.': '.',
    '-': '-',
}


def _submat_files():
    """
    Return the sorted names of the bundled substitution matrix files

    Lists the entries of the ``sugar.data.data_submat`` package and skips
    every entry whose name starts with ``README``.
    """
    names = []
    for entry in files('sugar.data.data_submat').iterdir():
        if entry.name.startswith('README'):
            continue
        names.append(entry.name)
    names.sort()
    return names


[docs]
@lru_cache
def submat(fname):
    """
    Return substitution matrix as a dict of dicts

    >>> from sugar.data import submat
    >>> bl = submat('blosum62')
    >>> bl['A']['A']
    4

    :param fname: One of the following values: ``{}``. Or use your own file.
    :return: Nested dict, ``mat[row_letter][column_letter]`` gives the score.
        The values of a row are floats if the row contains a ``.``,
        otherwise ints.
    :raises FileNotFoundError: if ``fname`` is neither an existing path nor
        the name of a bundled matrix

    ``fname`` is first tried as a path. If nothing exists there, the
    upper-cased name is looked up in the ``sugar.data.data_submat`` package.
    Lines that are empty or start with ``#`` are skipped, the first remaining
    line is the header with the column letters, each following line holds a
    row letter and its values.

    .. note::
        The result is cached per ``fname`` and the same dict object is
        returned on repeated calls, so it should not be modified in place.
    """
    from warnings import warn

    path = fname
    if not exists(path):
        # Not a user file, fall back to the matrices shipped with the package
        bundled = str(files('sugar.data.data_submat').joinpath(fname.upper()))
        if not exists(bundled):
            fnames = ', '.join(_submat_files())
            msg = f'No file at {fname} or {bundled}, available matrices: {fnames}'
            raise FileNotFoundError(msg)
        path = bundled
    with open(path) as handle:
        text = handle.read()

    header = None
    matrix = {}
    for raw in text.splitlines():
        stripped = raw.strip()
        if stripped == '' or stripped.startswith('#'):
            continue
        if header is None:
            # First data line lists the column letters
            header = raw.split()
            continue
        row_letter, values = raw.split(maxsplit=1)
        if row_letter not in header:
            warn(f'Letter {row_letter} not found in table header "{" ".join(header)}"')
        # The number type is chosen per row
        convert = float if '.' in values else int
        matrix[row_letter] = dict(zip(header, map(convert, values.split())))
    return matrix




[docs]
def scale_submat(sm, scale=1):
    """
    Return scaled substitution matrix

    The matrix values are divided by the sum of the absolute values of all
    entries and multiplied with the scale factor. The input matrix is not
    modified, a new dict of dicts is returned.

    :param sm: substitution matrix as a dict of dicts
    :param scale: scale factor
    :return: new dict of dicts with the scaled values
    :raises ZeroDivisionError: if the matrix has entries but all are zero

    .. warning::
        It is not clear if this function is useful. It might be removed in
        a later version of sugar without further notice.
    """
    # This function is deliberately not wrapped in lru_cache: the cache
    # hashes its arguments and a dict of dicts is unhashable, so every call
    # with a matrix would fail with a TypeError.
    total = sum(abs(v) for row in sm.values() for v in row.values())
    # Build a fresh matrix instead of scaling in place, the caller's matrix
    # may be shared, e.g. the cached result of submat().
    return {k1: {k2: v / total * scale for k2, v in row.items()}
            for k1, row in sm.items()}




[docs]
@lru_cache
def gcode(tt=1):
    """
    Return a genetic code object

    :param tt: number of the translation table (default: 1)
    :return: ``Attr`` object built from the entry for ``tt`` in the bundled
        ``gc.json`` file. The values of ``ttinv`` and the attributes
        ``starts``, ``astarts``, ``stops`` and ``astops`` are converted
        to sets.

    >>> from sugar.data import gcode
    >>> gc = gcode()
    >>> gc.tt['TAG']
    '*'
    >>> gc.starts  # doctest: +SKIP
    {'ATG', 'CTG', 'TTG'}

    .. note::
        The result is cached per ``tt`` and the same object is returned on
        repeated calls.
    """
    from sugar import Attr

    json_path = files('sugar.data.data_gcode').joinpath('gc.json')
    with open(json_path) as handle:
        tables = json.load(handle)
    # The tables are keyed by the table number as a string
    gc = Attr(**tables[str(tt)])
    # Turn the loaded codon collections into sets
    gc.ttinv = {key: set(codons) for key, codons in gc.ttinv.items()}
    for attr in ('starts', 'astarts', 'stops', 'astops'):
        setattr(gc, attr, set(getattr(gc, attr)))
    return gc



# Fill the ``{}`` placeholder in the docstring of submat with the names of the
# bundled matrices. Docstrings are set to None when Python runs with -OO and
# hasattr() is true even then, so the value itself has to be checked.
if submat.__doc__ is not None:
    submat.__doc__ = submat.__doc__.format(', '.join(_submat_files()))