Source code for sugar._io.clustal

# (C) 2024, Tom Eulenfeld, MIT license
"""
`Clustal`_ IO
"""
from warnings import warn
from sugar import BioBasket, BioSeq, __version__
from sugar._io.util import _add_fmt_doc


def is_clustal(f, **kw):
    """
    Check whether a file-like object starts with the Clustal signature

    Reads the first seven characters from ``f`` and compares them with the
    literal ``'CLUSTAL'``. Additional keyword arguments are accepted and
    ignored.
    """
    return f.read(7) == 'CLUSTAL'
@_add_fmt_doc('read')
def read_clustal(f):
    """
    Read Clustal alignment from file into `.BioBasket`

    Blank lines, the ``CLUSTAL`` header line and conservation lines
    are skipped. The sequence chunks of each id are concatenated in the
    order in which they appear in the file.
    """
    seqs = {}
    for line in f:
        line = line.strip()
        if line == '' or line.startswith('CLUSTAL'):
            continue
        if not line.strip('*:. \t'):
            # Conservation line: consists only of the symbols '*', ':', '.'
            # and whitespace. It has to be detected before split(), because
            # a conservation string with inner whitespace yields several
            # tokens and would otherwise be mistaken for "id seqdata".
            # If the conservation line is read in the future, split()
            # cannot be used, because its leading and trailing characters
            # might be whitespaces.
            continue
        parts = line.split()
        if len(parts) >= 2:
            # Further columns (e.g. a residue count) are ignored
            id_, seqdata = parts[:2]
            seqs.setdefault(id_, []).append(seqdata)
        # Lines with a single token carry no id/sequence pair and are ignored
    return BioBasket([BioSeq(''.join(data), id=id_) for id_, data in seqs.items()])
# Number of padding columns added to the longest id when writing;
# the id field of each written line is ``idlen + _SEP`` characters wide
_SEP = 5
# Number of sequence characters written per line of an alignment block
_CHARS = 60
@_add_fmt_doc('write')
def write_clustal(seqs, f, header_sugar=True, header=None):
    """
    Write `.BioBasket` to Clustal format

    :param bool header_sugar: Append header with sugar version to the first line, default is True
    :param str header: More information appended to the first line

    A warning is issued if the sequences have different lengths.
    For an empty basket only the header line is written.
    """
    line = 'CLUSTAL'
    if header_sugar:
        line = line + f' format written by sugar v{__version__}'
    if header:
        line = line + ' ' + header
    content = [line + '\n\n']
    lens = {len(seq) for seq in seqs}
    if len(lens) > 1:
        warn('Writing Clustal file with sequences of different lengths')
    if len(seqs) > 0:
        # max() raises ValueError on an empty iterable, therefore the
        # id width and the maximum length are only computed for a
        # non-empty basket
        idlen = max(len(id_) for id_ in seqs.ids)
        width = idlen + _SEP
        maxlen = max(lens)
        # One block per _CHARS characters; sequences shorter than the
        # longest one contribute an empty chunk in the trailing blocks
        for pos in range(0, maxlen, _CHARS):
            for seq in seqs:
                content.append(f'{seq.id:<{width}}{seq.data[pos:pos+_CHARS]}\n')
            content.append('\n')
    f.write(''.join(content))