# Source code for sugar._io.clustal

# (C) 2024, Tom Eulenfeld, MIT license
"""
`Clustal`_ IO
"""
from warnings import warn
from sugar import BioBasket, BioSeq, __version__
from sugar._io.util import _add_fmt_doc



[docs]
def is_clustal(f, **kw):
    """
    Check whether a file starts with the ``CLUSTAL`` keyword

    Reads the first 7 characters from ``f`` and compares them to the
    keyword. Additional keyword arguments are accepted and ignored.

    :param f: file-like object opened in text mode
    :return: True if the file starts with ``CLUSTAL``, otherwise False
    """
    magic = 'CLUSTAL'
    return f.read(len(magic)) == magic




[docs]
@_add_fmt_doc('read')
def read_clustal(f):
    """
    Read Clustal alignment from file into `.BioBasket`

    The chunks of each sequence, spread over the blocks of the file, are
    joined in order of appearance. Header lines, blank lines and
    conservation lines are skipped.
    """
    seqs = {}
    for line in f:
        # Conservation lines leave the id column blank, i.e. they start with
        # whitespace. This has to be tested before stripping: a conservation
        # string with inner blanks (e.g. '***** .: **') would otherwise split
        # into two or more parts and be mistaken for a sequence line.
        # Blank lines are skipped by the same test.
        if line[:1].isspace():
            continue
        line = line.strip()
        if line == '' or line.startswith('CLUSTAL'):
            continue
        parts = line.split()
        if len(parts) >= 2:
            # Only the first two columns are used; any further column
            # (e.g. a trailing residue count) is ignored.
            id_, seqdata = parts[:2]
            seqs.setdefault(id_, []).append(seqdata)
        # Lines consisting of a single token carry no sequence data
        # and are ignored.
    return BioBasket([BioSeq(''.join(data), id=id_) for id_, data in seqs.items()])



# Number of blanks written between the longest sequence id and the sequence
# data; shorter ids are padded so that the sequence data is aligned
_SEP = 5
# Maximum number of sequence characters written per line
_CHARS = 60



[docs]
@_add_fmt_doc('write')
def write_clustal(seqs, f, header_sugar=True, header=None):
    """
    Write `.BioBasket` to Clustal format

    The sequences are written in blocks of at most ``_CHARS`` characters per
    line. Each line starts with the left-justified sequence id, padded to the
    length of the longest id plus ``_SEP`` blanks. An empty basket results in
    a file that contains only the header.
    A warning is issued if the sequences differ in length.

    :param seqs: sequences to write
    :param f: file-like object opened for writing in text mode
    :param bool header_sugar: Append header with sugar version to the first line, default is True
    :param str header: More information appended to the first line
    """
    line = 'CLUSTAL'
    if header_sugar:
        line = line + f' format written by sugar v{__version__}'
    if header:
        line = line + ' ' + header
    content = [line + '\n\n']
    lens = {len(seq) for seq in seqs}
    if len(lens) > 1:
        warn('Writing Clustal file with sequences of different lengths')
    if len(seqs) > 0:
        # Both max() calls live inside the guard, because max() of an empty
        # iterable raises ValueError.
        width = max(len(id_) for id_ in seqs.ids) + _SEP
        maxlen = max(lens)
        for pos in range(0, maxlen, _CHARS):
            for seq in seqs:
                content.append(f'{seq.id:<{width}}{seq.data[pos:pos+_CHARS]}\n')
            # blank line terminates the block
            content.append('\n')
    f.write(''.join(content))