Source code for sugar._io.meme

# (C) 2024, Tom Eulenfeld, MIT license
"""
Read support for file formats associated with the `MEME`_ Suite
"""


from sugar._io.util import _add_fmt_doc
from sugar.core.fts import Feature, FeatureList


[docs] def is_fts_meme_txt(f, **kw): content = f.read(150) lines = content.splitlines() return '*' in lines[0] and lines[1].startswith('MEME')
[docs] def _read_motifs_meme_txt(f): """ Parse MEME txt file and return dictionary of motifs """ content = f.read() lines = content.splitlines() motif = None motifs = [] important_part = False for line in lines: if line.startswith('MOTIF'): if motif: motifs.append(motif) _, motif_seq, motif_id, *_, evalue = line.split() motif = {'motif': motif_seq, 'motif_id': motif_id, 'evalue': float(evalue), 'hits': []} elif motif: if important_part: if line.strip() == '': important_part = False elif not line.startswith('---') and not line.startswith('Sequence name'): seqid, strand, start, pvalue, *_ = line.split() motif['hits'].append({'seqid': seqid, 'strand': strand, 'start': int(start)-1, 'pvalue': float(pvalue)}) elif 'sites sorted by position p-value' in line: important_part = True assert len(motif['hits']) == 0 if motif: motifs.append(motif) return motifs
#https://regex101.com/r/82v1Zy/2 _MEME_TXT_REGEX = r"\*+\nMOTIF\s+(?P<motif>\w+)\s+(?P<motif_no>[\w-]+).+E-value = (?P<evalue>[\d\.e+-]+)\s*\n\*+\n.*Motif\s+(?P=motif)\s+(?P=motif_no) sites sorted by position p-value\n-+\nSeq[\s\w-]+\n[-\s]+\n(?P<hitdata>.*?)\n-+\n"
[docs] def _read_motifs_meme_txt_regex(f): """ Parse MEME txt file with regex and return dictionary of motifs """ import re content = f.read() motifs = [] for match in re.finditer(_MEME_TXT_REGEX, content, re.DOTALL): motif = { 'motif': match.group('motif'), 'motif_id': match.group('motif_no'), 'evalue': float(match.group('evalue')), 'hits': [] } hitdata = match.group('hitdata') for line in hitdata.splitlines(): seqid, strand, start, pvalue, *_ = line.split() motif['hits'].append({'seqid': seqid, 'strand': strand, 'start': int(start)-1, 'pvalue': float(pvalue)}) motifs.append(motif) return motifs
[docs] @_add_fmt_doc('read_fts') def read_fts_meme_txt(f, *, ftype=None, engine='regex'): """ MEME txt reader :param str ftype: Feature type of returned features :param str engine: Select ``'txt'`` or ``'regex'`` based parsing method, both should give the same result, default is ``'regex'`` Returns a flat `.FeatureList` of all motif hits. To get a dictionary of motifs call `FeatureList.groupby('motif')<.FeatureList.groupby>` or `FeatureList.groupby('motif_id')<.FeatureList.groupby>` on the returned object. Alternatively, use the `_read_motifs_meme_txt_regex` function directly. """ if engine == 'regex': motifs = _read_motifs_meme_txt_regex(f) elif engine == 'txt': motifs = _read_motifs_meme_txt(f) else: raise ValueError("Unknown read engine, must be one of 'regex', 'txt'") fts = [] for motif in motifs: for hit in motif['hits']: ft = Feature(ftype, start=hit['start'], stop=hit['start'] + len(motif['motif']), strand=hit['strand'], meta={ 'seqid': hit['seqid'], 'motif': motif['motif'], 'motif_id': motif['motif_id'], 'evalue': motif['evalue'], '_meme': {'pvalue': hit['pvalue']} }) fts.append(ft) return FeatureList(fts)