Source code for sugar._io.meme

# (C) 2024, Tom Eulenfeld, MIT license
"""
Read support for file formats associated with the `MEME`_ Suite
"""


from sugar._io.util import _add_fmt_doc
from sugar.core.fts import Feature, FeatureList



[docs]
def is_fts_meme_txt(f, **kw):
    content = f.read(150)
    lines = content.splitlines()
    return '*' in lines[0] and lines[1].startswith('MEME')




[docs]
def _read_motifs_meme_txt(f):
    """
    Parse MEME txt file and return dictionary of motifs
    """
    content = f.read()
    lines = content.splitlines()
    motif = None
    motifs = []
    important_part = False
    for line in lines:
        if line.startswith('MOTIF'):
            if motif:
                motifs.append(motif)
            _, motif_seq, motif_id, *_, evalue = line.split()
            motif = {'motif': motif_seq, 'motif_id': motif_id, 'evalue': float(evalue), 'hits': []}
        elif motif:
            if important_part:
                if line.strip() == '':
                    important_part = False
                elif not line.startswith('---') and not line.startswith('Sequence name'):
                    seqid, strand, start, pvalue, *_ = line.split()
                    motif['hits'].append({'seqid': seqid, 'strand': strand, 'start': int(start)-1, 'pvalue': float(pvalue)})
            elif 'sites sorted by position p-value' in line:
                important_part = True
                assert len(motif['hits']) == 0
    if motif:
        motifs.append(motif)
    return motifs



#https://regex101.com/r/82v1Zy/2
_MEME_TXT_REGEX = r"\*+\nMOTIF\s+(?P<motif>\w+)\s+(?P<motif_no>[\w-]+).+E-value = (?P<evalue>[\d\.e+-]+)\s*\n\*+\n.*Motif\s+(?P=motif)\s+(?P=motif_no) sites sorted by position p-value\n-+\nSeq[\s\w-]+\n[-\s]+\n(?P<hitdata>.*?)\n-+\n"


[docs]
def _read_motifs_meme_txt_regex(f):
    """
    Parse MEME txt file with regex and return dictionary of motifs
    """
    import re
    content = f.read()
    motifs = []
    for match in re.finditer(_MEME_TXT_REGEX, content, re.DOTALL):
        motif = {
            'motif': match.group('motif'),
            'motif_id': match.group('motif_no'),
            'evalue': float(match.group('evalue')),
            'hits': []
        }
        hitdata = match.group('hitdata')
        for line in hitdata.splitlines():
            seqid, strand, start, pvalue, *_ = line.split()
            motif['hits'].append({'seqid': seqid, 'strand': strand, 'start': int(start)-1, 'pvalue': float(pvalue)})
        motifs.append(motif)
    return motifs




[docs]
@_add_fmt_doc('read_fts')
def read_fts_meme_txt(f, *, ftype=None, engine='regex'):
    """
    MEME txt reader

    :param str ftype: Feature type of returned features
    :param str engine: Select ``'txt'`` or ``'regex'`` based parsing method, both should give the same result,
        default is ``'regex'``

    Returns a flat `.FeatureList` of all motif hits.
    To get a dictionary of motifs call `FeatureList.groupby('motif')<.FeatureList.groupby>` or
    `FeatureList.groupby('motif_id')<.FeatureList.groupby>` on the returned object.
    Alternatively, use the `_read_motifs_meme_txt_regex` function directly.
    """
    if engine == 'regex':
        motifs = _read_motifs_meme_txt_regex(f)
    elif engine == 'txt':
        motifs = _read_motifs_meme_txt(f)
    else:
        raise ValueError("Unknown read engine, must be one of 'regex', 'txt'")
    fts = []
    for motif in motifs:
        for hit in motif['hits']:
            ft = Feature(ftype,
                         start=hit['start'], stop=hit['start'] + len(motif['motif']),
                         strand=hit['strand'],
                         meta={
                             'seqid': hit['seqid'],
                             'motif': motif['motif'],
                             'motif_id': motif['motif_id'],
                             'evalue': motif['evalue'],
                             '_meme': {'pvalue': hit['pvalue']}
                         })
            fts.append(ft)
    return FeatureList(fts)