Source code for veoibd_synapse.data.parsers.GTF

#!/usr/bin/env python
"""Provide code to build pyparsing objects that deal with GTF lines."""

# Imports
from collections import namedtuple
import pyparsing as p

from munch import Munch, munchify

# Metadata
__author__ = "Gus Dunn"
__email__ = "w.gus.dunn@gmail.com"



class GTFLine(object):
    """Container for the fields of one parsed GTF line.

    Holds the eight leading GTF columns, the parsed ``attributes`` object and,
    optionally, the number of the line in the file it came from.  Values are
    stored exactly as given; no type conversion is performed here.
    """

    # Fixed attribute set: instances carry no per-instance ``__dict__``.
    __slots__ = ["seqname", "source", "feature", "start", "end",
                 "score", "strand", "frame", "attributes", "line_number"]

    def __init__(self, seqname, source, feature, start, end, score, strand,
                 frame, attributes, line_number=None):
        """Store every column value unchanged on the instance."""
        self.seqname = seqname
        self.source = source
        self.feature = feature
        self.start = start
        self.end = end
        self.score = score
        self.strand = strand
        self.frame = frame
        self.attributes = attributes
        self.line_number = line_number

    def __repr__(self):
        """Return a constructor-like text rendering of all fields."""
        # The eight leading columns are rendered inside double quotes ...
        quoted_fields = ("seqname", "source", "feature", "start", "end",
                         "score", "strand", "frame")
        pieces = ['{0}="{1}"'.format(field, getattr(self, field))
                  for field in quoted_fields]
        # ... while the attributes object and the line number are rendered bare.
        pieces.append("attributes={0}".format(self.attributes))
        pieces.append("line_number={0}".format(self.line_number))
        return "GTFLine({0})".format(", ".join(pieces))
## Helper parts
# Punctuation tokens wrapped in Suppress so they are matched but left out of
# the parse results.
semcol = p.Suppress(p.Literal(";"))
tab = p.Suppress(p.Literal("\t"))
space = p.Suppress(p.Literal(" "))
dquot = p.Suppress(p.Literal('"'))
squot = p.Suppress(p.Literal("'"))
# Either kind of quote character; the double quote is tried first.
quote = p.MatchFirst([dquot, squot])

# Keywords allowed in Attrs
# NOTE: this is probably why the parser is GLACIALLY slow
kws = [
    "ccdsid",
    "exon_id",
    "exon_number",
    "gene_biotype",
    "gene_id",
    "gene_name",
    "gene_source",
    "gene_status",
    "gene_type",
    "gene_version",
    "havana_gene",
    "havana_transcript",
    "level",
    "ont",
    "protein_id",
    "tag",
    "transcript_id",
    "transcript_name",
    "transcript_status",
    "transcript_support_level",
    "transcript_type",
]

## Actual parser pieces
# One Keyword expression per allowed attribute name, combined with Or.
attr_kws = p.Or([p.Keyword(word) for word in kws])
# An attribute item is an allowed keyword followed by a double-quoted value.
attr_item = p.And([attr_kws, p.QuotedString('"')])
def parse_gtf_file(path):
    """Parse full GTF file by yielding parsed GTF lines.

    Commented text is ignored, and lines that are empty or contain only
    whitespace once comments are removed are skipped.

    Args:
        path (Path): Path obj pointing to GTF file.

    Yields:
        GTFLine: representing a parsed GTF line.
    """
    with path.open('r') as gtf:
        # Line numbers are 1-based and count every physical line, including
        # the skipped ones, so they can be used to locate a record in the file.
        for line_in_file, line in enumerate(gtf, start=1):
            # discard text to the right of comments
            line = line.strip('\n').split('#')[0]

            # Only the emptiness test uses the stripped text; the line itself
            # is handed over untouched so that tab-delimited columns survive.
            if line.strip():
                yield parse_gtf_line(line, line_number=line_in_file)
# def parse_gtf_line1(line, line_number=None):
#     """Parse a single line of GTF file into its columns, converting the attributes into a dict.
#
#     Args:
#         line (str): One line of GTF formatted information.
#         line_number (int|None): Optional: number of the line this comes from in the file (starting from 1).
#
#     Returns:
#         GTFLine:
#     """
#     cols = line.strip('\n').split('\t')
#     cols[-1] = Munch({x[0][0]:x[0][1] for x in attr_item.scanString(cols[-1])})
#
#     return GTFLine(*cols,line_number=line_number)
def parse_gtf_line(line, line_number=None):
    """Parse a single line of GTF file into its columns, converting the attributes into a dict.

    The line is split on tabs.  Every column but the last is passed through
    unchanged (as a string); the last column is parsed as ``;``-separated
    ``key value`` attribute pairs.

    Args:
        line (str): One line of GTF formatted information.
        line_number (int|None): Optional: number of the line this comes from in the file (starting from 1).

    Returns:
        GTFLine: the parsed line, whose ``attributes`` field is a ``Munch``
        mapping each attribute key to its value string.

    Raises:
        ValueError: if an attribute item consists of a key with no value.
    """
    columns = line.strip('\n').split('\t')

    required_cols = columns[:-1]
    attrs_col = columns[-1]

    attrs = {}
    # All double quotes are removed before splitting, so a ';' inside a quoted
    # value is still treated as an item separator.
    for attr_string in attrs_col.replace('"', '').split(';'):
        attr_string = attr_string.strip()

        # Empty items come from leading/trailing/doubled ';' or trailing
        # whitespace after the final ';'; they carry no information.
        if not attr_string:
            continue

        # Split on the first run of whitespace only, so that values which
        # themselves contain spaces are kept whole.
        parts = attr_string.split(None, 1)
        if len(parts) != 2:
            raise ValueError(
                "Malformed GTF attribute {attr!r} (line_number={num}): "
                "expected a key followed by a value.".format(attr=attr_string,
                                                             num=line_number)
            )

        key, value = parts
        # A key that occurs more than once keeps its last value.
        attrs[key] = value

    attr_lib = Munch(attrs)

    return GTFLine(*required_cols, attr_lib, line_number=line_number)