Dealing with non-ENSEMBL GTF files

Here is a quick overview on how to deal with non-ENSEMBL GTF files. By no means a comprehensive guide, just some suggestions.

You don’t have to do your data wrangling in Polars. You could always do it using Pandas and then convert it to a Polars dataframe at the end by running: polars_df = pandas_df.from_pandas()

[1]:
import RNApysoforms as RNApy
import polars as pl
[2]:
## Path to your non-ENSEMBL GTF file
alternative_gtf_path = "../../tests/test_data/alternative_gtf_format_chr_21_and_Y.gtf"



# Define the column names for the GTF file
column_names = [
    "seqnames",    # Chromosome or sequence name
    "source",      # Annotation source
    "type",        # Feature type (e.g., exon, CDS)
    "start",       # Start position of the feature
    "end",         # End position of the feature
    "score",       # Score value (usually '.')
    "strand",      # Strand information ('+' or '-')
    "phase",       # Reading frame phase
    "attributes"   # Additional attributes in key-value pairs
]

# Definte types for GTF columns
dtypes = {
    "seqnames": pl.Utf8,
    "source": pl.Utf8,
    "type": pl.Utf8,
    "start": pl.Int64,
    "end": pl.Int64,
    "score": pl.Utf8,
    "strand": pl.Utf8,
    "phase": pl.Utf8,
    "attributes": pl.Utf8
}

# Read the GTF file using Polars
alt_gtf_df = pl.read_csv(
    alternative_gtf_path,
    separator="\t",
    has_header=False,
    comment_prefix="#",       # Skip comment lines starting with '#'
    new_columns=column_names, # Assign column names since GTF files have no header
    schema_overrides=dtypes             # Specify data types for each column
)

[3]:
## Make sure polars print the full column contents
pl.Config(fmt_str_lengths=1000, tbl_width_chars=1000)


## Visualize GTF file format
alt_gtf_df.head()
[3]:
shape: (5, 9)
seqnamessourcetypestartendscorestrandphaseattributes
strstrstri64i64strstrstrstr
"21""Bambu""transcript"50117995017145".""+"".""gene_id "ENSG00000279493"; transcript_id "ENST00000624081";"
"21""Bambu""exon"50117995011874".""+"".""gene_id "ENSG00000279493"; transcript_id "ENST00000624081"; exon_number "1";"
"21""Bambu""exon"50125485012687".""+"".""gene_id "ENSG00000279493"; transcript_id "ENST00000624081"; exon_number "2";"
"21""Bambu""exon"50143865014471".""+"".""gene_id "ENSG00000279493"; transcript_id "ENST00000624081"; exon_number "3";"
"21""Bambu""exon"50169355017145".""+"".""gene_id "ENSG00000279493"; transcript_id "ENST00000624081"; exon_number "4";"
[4]:
## See all possible values for "type column"
alt_gtf_df["type"].unique()
[4]:
shape: (2,)
type
str
"transcript"
"exon"
[5]:
## Filter out to only keep exons
alt_gtf_df = alt_gtf_df.filter(pl.col("type") == "exon")

## Extract the "gene_id", "transcript_id", and "exon_number" from the attributes column and assign them to columns
alt_gtf_df = alt_gtf_df.with_columns([pl.col("attributes").str.extract(r'gene_id "([^"]+)"', 1).alias("gene_id"),
    pl.col("attributes").str.extract(r'transcript_id "([^"]+)"', 1).alias("transcript_id"),
    pl.col("attributes").str.extract(r'exon_number "([^"]+)"', 1).alias("exon_number")])

## Visualize data
alt_gtf_df.head()


[5]:
shape: (5, 12)
seqnamessourcetypestartendscorestrandphaseattributesgene_idtranscript_idexon_number
strstrstri64i64strstrstrstrstrstrstr
"21""Bambu""exon"50117995011874".""+"".""gene_id "ENSG00000279493"; transcript_id "ENST00000624081"; exon_number "1";""ENSG00000279493""ENST00000624081""1"
"21""Bambu""exon"50125485012687".""+"".""gene_id "ENSG00000279493"; transcript_id "ENST00000624081"; exon_number "2";""ENSG00000279493""ENST00000624081""2"
"21""Bambu""exon"50143865014471".""+"".""gene_id "ENSG00000279493"; transcript_id "ENST00000624081"; exon_number "3";""ENSG00000279493""ENST00000624081""3"
"21""Bambu""exon"50169355017145".""+"".""gene_id "ENSG00000279493"; transcript_id "ENST00000624081"; exon_number "4";""ENSG00000279493""ENST00000624081""4"
"21""Bambu""exon"50225315022693".""+"".""gene_id "ENSG00000277117"; transcript_id "ENST00000623960"; exon_number "1";""ENSG00000277117""ENST00000623960""1"
[6]:
# Select the relevant columns and reorder them to make it prettier to look at (this step is optional)
alt_gtf_df = alt_gtf_df.select([
    "gene_id",
    "transcript_id",
    "seqnames",
    "strand",
    "type",
    "start",
    "end",
    "exon_number"])

# Cast 'exon_number' to Int64, handling possible nulls without strict type enforcement (this is mandatory!)
alt_gtf_df = alt_gtf_df.with_columns([pl.col("exon_number").cast(pl.Int64, strict=False)])

"""
Alternatively if your GTF annotation did not provide exon number you could calculate it by running:

`alt_gtf_df = RNApy.calculate_exon_number(alt_gtf_df)`

See RNApysoforms function documentation for more details about the `calculate_exon_number()` function
"""

## Visualize dataframe
alt_gtf_df.head()
[6]:
shape: (5, 8)
gene_idtranscript_idseqnamesstrandtypestartendexon_number
strstrstrstrstri64i64i64
"ENSG00000279493""ENST00000624081""21""+""exon"501179950118741
"ENSG00000279493""ENST00000624081""21""+""exon"501254850126872
"ENSG00000279493""ENST00000624081""21""+""exon"501438650144713
"ENSG00000279493""ENST00000624081""21""+""exon"501693550171454
"ENSG00000277117""ENST00000623960""21""+""exon"502253150226931
[7]:
"""
Proceed to generate figures as usual. Notice that we did not have "transcript_biotype" in the attributes columns,
therefore we have to either pick another column to "hue" our RNA isoforms structure plot with or just
pick a fillcolor for all the RNA isoforms (default is grey)
"""

## Get counts matrix data path and metadata path
counts_matrix_path = "../../tests/test_data/counts_matrix_chr21_and_Y.tsv"
metadata_path = "../../tests/test_data/sample_metadata.tsv"

## Read counts matrix with metadata and normalizations
counts_matrix = RNApy.read_expression_matrix(expression_matrix_path=counts_matrix_path,
                                          metadata_path=metadata_path,
                                           cpm_normalization=True, relative_abundance=True)


"""
Filter annotation by gene_id instead of gene_name by using the `gene_id_column` parameter
since we don't have "gene_name" in the GTF file used.
You can always get creative with "joins" with other GTF annotations
to fill in more information about specific genes and transcripts, just make sure you
do NOT have any null/NA values in your columns. One way to get around that is to
fill NAs in the gene_name column using the gene_id column.
"""
sod1_annotation, sod1_expression_matrix = RNApy.gene_filtering(annotation=alt_gtf_df,
                                     expression_matrix=counts_matrix, order_by_expression_column="counts",
                                     order_by_expression=True, gene_id_column="gene_id", ## This is "gene_name" by default
                                    target_gene="ENSG00000142168" ## SOD1 ensembl gene_id
                                    )

## Rescale introns
sod1_annotation = RNApy.shorten_gaps(sod1_annotation)

"""
Make traces using a constant value for annotation fill color since we don't have
the "transcript_biotype" color for hue. You could alternatively hue the RNA
isoform structure plot by a different annotation column such as
"transcript_id", thus making every transcript a unique color
"""
traces = RNApy.make_traces(annotation=sod1_annotation, expression_matrix=sod1_expression_matrix,
                        x_start="rescaled_start", x_end="rescaled_end",
                         y='transcript_id',
                         annotation_fill_color="red", ## Set annotation fill color to a constant value
                         expression_hue="AD status",
                         hover_start="start", hover_end="end")

## Put traces into figure
fig = RNApy.make_plot(traces = traces, subplot_titles = ["Transcript Structure", "Counts"], width=1200, height=500)

## Show figure
fig.show()

Notes:

You can click on the legend items to make figure elements appear and disappear.

The legend title will get grayed out when clicking on the first legend item. I could not find a workaround for that with the current plotly release (version 5).

The hovering for exons and CDS works best if you hover your mouse over the corners of the CDS/exon boxes.