Source code for sugar._io.stockholm

# (C) 2024, Tom Eulenfeld, MIT license
"""
`Stockholm`_ IO
"""

from warnings import warn

from sugar.core.seq import Attr, BioSeq, BioBasket
from sugar._io.util import _add_fmt_doc


filename_extensions_stockholm = ['stk', 'sto', 'stockholm']


[docs] def is_stockholm(f, **kw): content = f.read(11) return content == '# STOCKHOLM'
[docs] def row2fts(row, type_=None, seqid=None, fillchars='-.', splitchar='|'): import re from sugar.core.fts import Defect, Feature, FeatureList, Location i = 0 fts = [] while i < len(row): j = row.find(splitchar, i+1) if j == -1: j = len(row) names = set(re.split(f'[{fillchars}]+', row[i+1:j])) - {''} if len(names) > 0: stop = min(j+1, len(row)) if len(names) > 1: warn('More than one name for feature at location {i}:{stop}, use shortest name') names = sorted(names, key=lambda n: len(n))[:1] name = names.pop() if i == 0 and row[0] != splitchar and j == len(row): assert row[-1] != splitchar defect = Defect.MISS_LEFT | Defect.MISS_RIGHT if i == 0 and row[0] != splitchar: defect = Defect.MISS_LEFT elif j == len(row): assert row[-1] != splitchar defect = Defect.MISS_RIGHT else: defect = Defect.NONE loc = Location(i, stop, defect=defect) ft = Feature(type_, [loc]) ft.meta.name = name if seqid is not None: ft.seqid = seqid fts.append(ft) i = j return FeatureList(fts)
[docs] def fts2row(fts, inchar='-', outchar='.', splitchar='|', lensec=None): openchar = closechar = bothchar = splitchar row = [] last_stop = 0 for ft in fts.sort(): if len(ft.locs) > 1: warn('More than one location in feature, use full loc_range') start, stop = ft.locs.range dif = start - last_stop if dif > 0: row.append(outchar * dif) elif dif == -1: # use the same | for last and this feature assert row[-1][-1] == closechar row[-1] = row[-1][:-1] elif dif < -1: raise ValueError('Features overlap more than one residue') l = stop - start name = '' if ft.name is None else ft.name rowp = name while l > len(name) + len(rowp) + 150: rowp = name + inchar * 100 + rowp rowp = rowp.center(l, inchar) if l < len(name)-2: warn('Feature name too long, shorten it') rowp = inchar + name[:l-2] + inchar if (ft.loc.defect.MISS_LEFT not in ft.loc.defect and ft.loc.defect.BEYOND_LEFT not in ft.loc.defect): rowp = (bothchar if dif == -1 else openchar) + rowp[1:] if (ft.loc.defect.MISS_RIGHT not in ft.locs[-1].defect and ft.loc.defect.BEYOND_RIGHT not in ft.locs[-1].defect): rowp = rowp[:-1] + closechar assert len(rowp) == l row.append(rowp) last_stop = stop return ''.join(row).ljust(lensec or len(row), outchar)
[docs] @_add_fmt_doc('read') def read_stockholm(f, comments=None): """ Read Stockholm file :param list comments: comment lines inside the file are stored in the comments list (optional) .. note:: By default only the first alignment is read and returned. If your file contains more than one alignment, you can read some or all of them with:: from sugar import read with open('example_stockholm_multi.stk') as f: seqs1 = read(f, 'stockholm') # read 1st alignment seqs2 = read(f, 'stockholm') # read 2nd alignment """ seqs = [] gf = Attr() gc = Attr() gs = {} gr = {} seqs = {} for line in f: line = line.strip() if line == '' or line.startswith('# STOCKHOLM'): continue elif line.startswith('#=GF CC written by sugar'): continue elif line.startswith('#=GF'): _, key, val = line.split(maxsplit=2) gf[key] = gf[key] + ' ' + val if key in gf else val elif line.startswith('#=GC'): _, key, val = line.split(maxsplit=2) gc[key] = gc.get(key, '') + val elif line.startswith('#=GS'): _, seqid, key, val = line.split(maxsplit=3) gs.setdefault(seqid, {}) gs[seqid][key] = gs[seqid][key] + ' ' + val if key in gs[seqid] else val elif line.startswith('#=GR'): _, seqid, key, val = line.split(maxsplit=3) gr[seqid][key] = gr.setdefault(seqid, {}).get(key, '') + val elif line.startswith('#'): # ignore other comments if comments is not None: comments.append(line) elif line.startswith('//'): # end of alignment, stop reading break elif ' ' in line: # sequence key, val = line.split(maxsplit=1) seqs[key] = seqs.get(key, '') + val # if seq is not None and seq.id == key: # seq += val # else: # if seq is not None: # seqs.append(seq) # seq = BioSeq(val, id=key) else: # should not happen raise ValueError('Invalid stockholm format, please contact devs') seqs = [BioSeq(val, id=key) for key, val in seqs.items()] for seq in seqs: if seq.id in gs: seq.meta.setdefault('_stockholm', Attr()).GS = gs[seq.id] if seq.id in gr: seq.meta.setdefault('_stockholm', Attr()).GR = gr[seq.id] seqs = BioBasket(seqs) if len(gf) > 0: seqs.meta.setdefault('_stockholm', Attr()).GF = gf if len(gc) > 0: seqs.meta.setdefault('_stockholm', Attr()).GC = gc return seqs
[docs] @_add_fmt_doc('write') def write_stockholm(seqs, f, header_sugar=True): """ Write sequences to Stockholm file :param bool header_sugar: Add a comment to the header with sugar version, default is True """ lines = ['# STOCKHOLM 1.0'] if header_sugar: from sugar import __version__ lines.append(f'#=GF CC written by sugar v{__version__}') try: gf = seqs.meta._stockholm.GF except AttributeError: pass else: for key, value in gf.items(): lines.append(f'#=GF {key} {value}') for seq in seqs: try: gs = seq.meta._stockholm.GS except (AttributeError, KeyError): pass else: for key, value in gs.items(): lines.append(f'#=GS {seq.id} {key} {value}') for seq in seqs: lines.append(f'{seq.id} {seq}') try: gr = seq.meta._stockholm.GR except (AttributeError, KeyError): pass else: for key, value in gr.items(): lines.append(f'#=GR {seq.id} {key} {value}') try: gc = seqs.meta._stockholm.GC except AttributeError: pass else: for key, value in gc.items(): lines.append(f'#=GC {key} {value}') lines.append('//\n') f.write('\n'.join(lines))