# Source code for sugar._io.tab.mmseqs
# (C) 2024, Tom Eulenfeld, MIT license
"""
`MMseqs2`_ reader for output generated with option fmtmode 4 (preferred) or 0
"""
from sugar._io.tab.core import read_tabular, _MMSEQS_HEADER_NAMES
from sugar._io.util import _add_fmt_doc
[docs]
def is_fts_mmseqs(f, *, sep='\t', outfmt=None, **kw):
    """
    Check whether a file looks like tabular output written by MMseqs2

    Only the first line found in the first 1000 characters is inspected.

    :param f: Object with a ``read()`` method returning text
    :param str sep: Separator of fields, default ``'\\t'``
    :param str outfmt: Forwarded to `read_fts_mmseqs`; if given, the
        plausibility check on the identity value is skipped
    :param \\*\\*kw: Further keyword arguments forwarded to
        `read_fts_mmseqs`
    :return: ``True`` if the first line is a header made up only of known
        MMseqs2 header names, or if it can be parsed into exactly one
        feature that passes the identity check; ``False`` otherwise,
        including for empty input
    """
    lines = f.read(1000).splitlines()
    if not lines:
        # ''.splitlines() is an empty list; an empty file cannot be MMseqs2
        # output, so answer "no" instead of failing with an IndexError
        return False
    # only the first line is used for detection
    line = lines[0]
    fields = line.strip().split(sep)
    # fields can be empty for sep=None, and an empty set is a subset of
    # everything, therefore the explicit emptiness check
    if fields and set(fields) <= set(_MMSEQS_HEADER_NAMES):
        # MMseqs header present, fmtmode 4 was used
        return True
    # try to read it
    fts = read_fts_mmseqs([line], sep=sep, outfmt=outfmt, **kw)
    if len(fts) != 1:
        return False
    if outfmt is not None:
        # the caller stated the column layout explicitly, trust it
        return True
    # BLAST and MMseqs2 have a *similar* default output
    # (BLAST outfmt 6 vs MMseqs2 fmtmode 0)
    # All header fields are the same, except the 3rd field.
    # In the 3rd field BLAST uses pident (percentage identity),
    # while MMseqs2 uses fident (fraction identity)!
    # If a BLAST file is interpreted as written by MMseqs2,
    # the pident field is interpreted as fident resulting in
    # pident values above 100%
    # (if pident > 1% which is reasonable for most files).
    # This is a way to discriminate both file types,
    # but we need to check first if a file was written by MMseqs2,
    # and only afterwards if it was written by BLAST.
    return 0 <= fts[0].meta._mmseqs.pident <= 100
[docs]
@_add_fmt_doc('read_fts')
def read_fts_mmseqs(f, *, sep='\t', outfmt=None, ftype=None, comments=None):
    """
    Read features from tabular output produced by MMseqs2

    All options are handed over to `read_tabular` together with
    ``fmt='mmseqs'``.

    :param str sep: Field separator, ``'\\t'`` by default,
        use ``None`` to split on any whitespace.
    :param str outfmt: The string that was passed to the MMseqs2 option
        ``--format-output``, not needed if MMseqs2 was called with
        ``--fmtmode 4`` or with ``--fmtmode 0`` and the default
        format-output
    :param str ftype: Parameter used as ftype
    :param list comments: Optional list, comment lines found inside the
        file are stored in it
    """
    options = {
        'sep': sep,
        'outfmt': outfmt,
        'ftype': ftype,
        'comments': comments,
    }
    return read_tabular(f, fmt='mmseqs', **options)