# Source code for sugar._io.fasta

# (C) 2024, Tom Eulenfeld, MIT license
"""
`FASTA`_ IO
"""
import re

from sugar import BioSeq
from sugar._io.util import _add_fmt_doc


# Filename extensions (without leading dot) associated with the FASTA format.
# NOTE(review): presumably consumed by the IO machinery for format detection
# by filename -- confirm against the caller.
filename_extensions_fasta = ['fasta', 'fa', 'fna', 'faa', 'fas']


[docs]
def is_fasta(f, **kw):
    """
    Check whether the content of a text stream looks like FASTA

    Reads up to 50 characters from ``f`` and reports whether, ignoring
    surrounding whitespace, they start with ``>``.

    :param f: readable text stream
    :param kw: accepted for interface compatibility, not used
    :return: True if the inspected characters start with ``>``
    """
    head = f.read(50).lstrip()
    return head.startswith('>')



def _create_bioseq(datalines, id_, header):
    """
    Build a `.BioSeq` from the collected parts of one FASTA record

    :param datalines: iterable of sequence strings, joined without separator
    :param id_: sequence id passed on to `.BioSeq`
    :param header: header text, stored as ``seq.meta._fasta.header``
    :return: the new `.BioSeq` object
    """
    seq = BioSeq(''.join(datalines), id=id_)
    # Keep the header around so that it is available again when writing.
    # NOTE(review): relies on seq.meta turning the plain dict into an
    # attribute-style container -- confirm against the meta class.
    seq.meta._fasta = {}
    seq.meta._fasta.header = header
    return seq


# Character class for a single character allowed inside a seq ID:
# anything except comma, pipe, semicolon and whitespace
CHS = r'[^,|;\s]'

# capture some cases from https://de.wikipedia.org/wiki/FASTA-Format
# Alternatives are tried in order, each one captures the ID in its own group.
IDPATTERN = '|'.join((
    rf'[^\s]*gb[:|]({CHS}+)',  # gb:id, gb|id
    rf'[^\s]*(?:emb|dbj|sp|tr|ref|lcl)[|]({CHS}+)',  # xxx|id
    rf'({CHS}+)',  # "id ", "id;", "id|", "id,"
))


def _id_from_header(header):
    """
    Extract the sequence id from a FASTA header

    The header is matched against ``IDPATTERN`` from its start; the first
    capture group that took part in the match is the id.

    :param str header: header text without the leading ``>``
    :return: the id, or None for an empty or non-matching header
    """
    if header == '':
        return None
    found = re.match(IDPATTERN, header)
    if found is None:
        return None
    # Only the group of the alternative that matched is not None
    return next((grp for grp in found.groups() if grp is not None), None)



[docs]
@_add_fmt_doc('read')
def iter_fasta(f, comments=None):
    """
    Iterate through a FASTA file and yield `.BioSeq` sequences

    :param list comments: comment lines inside the file are stored in
        the comments list (optional)
    :raises ValueError: if sequence data is found before the first
        header line
    """
    id_ = None
    header = None
    data = None  # stays None until the first header line has been seen
    for line in f:
        if line.startswith('>'):
            if data is not None:
                # A new header terminates the previous record
                yield _create_bioseq(data, id_, header)
            header = line.lstrip('>').strip()
            id_ = _id_from_header(header)
            data = []
        elif line.startswith(';'):  # line is a comment
            if comments is not None:
                comments.append(line)
        else:
            line = line.strip()
            if not line:
                # Blank lines carry no sequence data
                continue
            if data is None:
                # Without this check the append below would fail with an
                # unhelpful AttributeError on None
                raise ValueError(
                    'Invalid FASTA file: sequence data found before the '
                    f'first header line: {line[:50]!r}')
            data.append(line)
    if data is not None:
        # Emit the last record of the file
        yield _create_bioseq(data, id_, header)




[docs]
@_add_fmt_doc('write')
def append_fasta(seq, f):
    """
    Append a `.BioSeq` sequence to a FASTA file

    The record is written as a ``>id description`` line followed by the
    sequence data on a single line. If ``seq.meta._fasta.header`` is
    present, it is used as description, with a leading copy of the id
    removed so that the id is not written twice.

    :param seq: the sequence to write
    :param f: writable text stream
    """
    id_ = seq.id or ''
    description = ''
    if '_fasta' in seq.meta and 'header' in seq.meta._fasta:
        description = seq.meta._fasta.header
        if id_ and description.startswith(id_):
            rest = description[len(id_):]
            # Drop the id only if it is the complete leading token, i.e.
            # it is followed by a character that cannot be part of an id
            # (or by nothing). Otherwise an id such as "seq1" would
            # truncate a header starting with "seq10".
            if not rest or rest[0].isspace() or rest[0] in ',|;':
                description = rest
        description = (' ' + description.lstrip()).rstrip()
    f.write(f'>{id_}{description}\n{seq.data}\n')