Annotate RNA modification sites of interest or other single nucleotide sites of interest

Annotate RNA modifications of interest in RNA isoforms or other single nucleotide sites of interest.

This is a “hacky” way to do this, but it is possible.

[1]:

import RNApysoforms as RNApy
import polars as pl

[2]:

## Path to your ENSEMBL GTF file, counts matrix file, and metadata file
ensembl_gtf_path = "../../tests/test_data/Homo_sapiens_chr21_and_Y.GRCh38.110.gtf"
counts_matrix_path = "../../tests/test_data/counts_matrix_chr21_and_Y.tsv"
metadata_path = "../../tests/test_data/sample_metadata.tsv"


## Read ENSEMBL GTF and counts matrix with metadata and normalizations
annotation = RNApy.read_ensembl_gtf(ensembl_gtf_path)
counts_matrix = RNApy.read_expression_matrix(expression_matrix_path=counts_matrix_path,
                                          metadata_path=metadata_path,
                                           cpm_normalization=True, relative_abundance=True)


## Filter APP gene and keep only top 1 expressed transcripts
app_annotation, app_counts_matrix = RNApy.gene_filtering(annotation=annotation, expression_matrix=counts_matrix, target_gene="APP",
                                                        order_by_expression=True, keep_top_expressed_transcripts=1,
                                                        order_by_expression_column="counts")

[3]:

## Display app annotation for the transcript of interest
app_annotation.head()

[3]:

shape: (5, 11)

gene_id	gene_name	transcript_id	transcript_name	transcript_biotype	seqnames	strand	type	start	end	exon_number
str	str	str	str	str	str	str	str	i64	i64	i64
"ENSG00000142192"	"APP"	"ENST00000348990"	"APP-202"	"protein_coding"	"21"	"-"	"exon"	26170564	26170767	1
"ENSG00000142192"	"APP"	"ENST00000348990"	"APP-202"	"protein_coding"	"21"	"-"	"CDS"	26170564	26170620	1
"ENSG00000142192"	"APP"	"ENST00000348990"	"APP-202"	"protein_coding"	"21"	"-"	"exon"	26111979	26112146	2
"ENSG00000142192"	"APP"	"ENST00000348990"	"APP-202"	"protein_coding"	"21"	"-"	"CDS"	26111979	26112146	2
"ENSG00000142192"	"APP"	"ENST00000348990"	"APP-202"	"protein_coding"	"21"	"-"	"exon"	26089943	26090072	3

[4]:

"""
Create RNA modification dataframe with 1 nucleotide entries for the RNA modifications of interest.
Notice that the entried must be encoded as type="CDS" and have all other information matching the exon you will be annotating.
Here we made the entries 1 nucleotide long, as one would expect an RNA modification annotation to be.
This could be done in a more systematic way if a list of modification sites is provided, but we leave
that more complex implementation up to future users to develop if interested, we are happy to collaborate in such an
implementation
"""

RNA_modification_data = {
    "gene_id": ["ENSG00000142192"] * 2,
    "gene_name": ["APP"] * 2,
    "transcript_id": ["ENST00000348990"] * 2,
    "transcript_name": ["APP-202"] * 2,
    "transcript_biotype": ["protein_coding"] * 2,
    "seqnames": ["21"] * 2,
    "strand": ["-"] * 2,
    "type": ["CDS"] * 2,
    "start": [26170600, 26112010],
    "end": [26170601, 26112011],
    "exon_number": [1, 2]
}

# Create the Polars DataFrame
RNA_modification_df = pl.DataFrame(RNA_modification_data)

# Display the RNA modification data
RNA_modification_df.head()

[4]:

shape: (2, 11)

gene_id	gene_name	transcript_id	transcript_name	transcript_biotype	seqnames	strand	type	start	end	exon_number
str	str	str	str	str	str	str	str	i64	i64	i64
"ENSG00000142192"	"APP"	"ENST00000348990"	"APP-202"	"protein_coding"	"21"	"-"	"CDS"	26170600	26170601	1
"ENSG00000142192"	"APP"	"ENST00000348990"	"APP-202"	"protein_coding"	"21"	"-"	"CDS"	26112010	26112011	2

[5]:

# Filter out rows where type is "CDS" in the original APP annotations
app_annotation = app_annotation.filter(pl.col("type") != "CDS")

# Concatenate the DataFrames vertically to add RNA modification to the APP transcript annotations
app_annotation = pl.concat([app_annotation, RNA_modification_df])

[6]:

# Rescale introns
app_annotation = RNApy.shorten_gaps(app_annotation)

[7]:

## Make CDS type into Inosine so that it is correctly displayed as an RNA modification in the figure
app_annotation = app_annotation.with_columns(
    pl.when(pl.col("type") == "CDS")
      .then(pl.lit("Inosine"))
      .otherwise(pl.col("transcript_biotype"))
      .alias("transcript_biotype")
)

[8]:

traces = RNApy.make_traces(annotation=app_annotation,
                        x_start="rescaled_start", x_end="rescaled_end",
                         y='transcript_id', annotation_hue="transcript_biotype",
                         hover_start="start", hover_end="end",
                         expression_hue="AD status", marker_size=3, arrow_size=7)

## Put traces into a figure
fig = RNApy.make_plot(traces=traces, subplot_titles=["Transcript Structure", "Counts", "CPM", "Relative Abundance"],
                   width=1200, height=500)

## Show figure
fig.show()