Annotate RNA modification sites of interest or other single nucleotide sites of interest
Annotate RNA modifications of interest in RNA isoforms or other single nucleotide sites of interest.
This is a “hacky” way to do this, but it is possible.
[1]:
import RNApysoforms as RNApy
import polars as pl
[2]:
## Path to your ENSEMBL GTF file, counts matrix file, and metadata file
ensembl_gtf_path = "../../tests/test_data/Homo_sapiens_chr21_and_Y.GRCh38.110.gtf"
counts_matrix_path = "../../tests/test_data/counts_matrix_chr21_and_Y.tsv"
metadata_path = "../../tests/test_data/sample_metadata.tsv"
## Read ENSEMBL GTF and counts matrix with metadata and normalizations
annotation = RNApy.read_ensembl_gtf(ensembl_gtf_path)
counts_matrix = RNApy.read_expression_matrix(expression_matrix_path=counts_matrix_path,
metadata_path=metadata_path,
cpm_normalization=True, relative_abundance=True)
## Filter APP gene and keep only top 1 expressed transcripts
app_annotation, app_counts_matrix = RNApy.gene_filtering(annotation=annotation, expression_matrix=counts_matrix, target_gene="APP",
order_by_expression=True, keep_top_expressed_transcripts=1,
order_by_expression_column="counts")
[3]:
## Display app annotation for the transcript of interest
app_annotation.head()
[3]:
shape: (5, 11)
| gene_id | gene_name | transcript_id | transcript_name | transcript_biotype | seqnames | strand | type | start | end | exon_number |
|---|---|---|---|---|---|---|---|---|---|---|
| str | str | str | str | str | str | str | str | i64 | i64 | i64 |
| "ENSG00000142192" | "APP" | "ENST00000348990" | "APP-202" | "protein_coding" | "21" | "-" | "exon" | 26170564 | 26170767 | 1 |
| "ENSG00000142192" | "APP" | "ENST00000348990" | "APP-202" | "protein_coding" | "21" | "-" | "CDS" | 26170564 | 26170620 | 1 |
| "ENSG00000142192" | "APP" | "ENST00000348990" | "APP-202" | "protein_coding" | "21" | "-" | "exon" | 26111979 | 26112146 | 2 |
| "ENSG00000142192" | "APP" | "ENST00000348990" | "APP-202" | "protein_coding" | "21" | "-" | "CDS" | 26111979 | 26112146 | 2 |
| "ENSG00000142192" | "APP" | "ENST00000348990" | "APP-202" | "protein_coding" | "21" | "-" | "exon" | 26089943 | 26090072 | 3 |
[4]:
"""
Create RNA modification dataframe with 1 nucleotide entries for the RNA modifications of interest.
Notice that the entried must be encoded as type="CDS" and have all other information matching the exon you will be annotating.
Here we made the entries 1 nucleotide long, as one would expect an RNA modification annotation to be.
This could be done in a more systematic way if a list of modification sites is provided, but we leave
that more complex implementation up to future users to develop if interested, we are happy to collaborate in such an
implementation
"""
RNA_modification_data = {
"gene_id": ["ENSG00000142192"] * 2,
"gene_name": ["APP"] * 2,
"transcript_id": ["ENST00000348990"] * 2,
"transcript_name": ["APP-202"] * 2,
"transcript_biotype": ["protein_coding"] * 2,
"seqnames": ["21"] * 2,
"strand": ["-"] * 2,
"type": ["CDS"] * 2,
"start": [26170600, 26112010],
"end": [26170601, 26112011],
"exon_number": [1, 2]
}
# Create the Polars DataFrame
RNA_modification_df = pl.DataFrame(RNA_modification_data)
# Display the RNA modification data
RNA_modification_df.head()
[4]:
shape: (2, 11)
| gene_id | gene_name | transcript_id | transcript_name | transcript_biotype | seqnames | strand | type | start | end | exon_number |
|---|---|---|---|---|---|---|---|---|---|---|
| str | str | str | str | str | str | str | str | i64 | i64 | i64 |
| "ENSG00000142192" | "APP" | "ENST00000348990" | "APP-202" | "protein_coding" | "21" | "-" | "CDS" | 26170600 | 26170601 | 1 |
| "ENSG00000142192" | "APP" | "ENST00000348990" | "APP-202" | "protein_coding" | "21" | "-" | "CDS" | 26112010 | 26112011 | 2 |
[5]:
# Filter out rows where type is "CDS" in the original APP annotations
app_annotation = app_annotation.filter(pl.col("type") != "CDS")
# Concatenate the DataFrames vertically to add RNA modification to the APP transcript annotations
app_annotation = pl.concat([app_annotation, RNA_modification_df])
[6]:
# Rescale introns
app_annotation = RNApy.shorten_gaps(app_annotation)
[7]:
## Make CDS type into Inosine so that it is correctly displayed as an RNA modification in the figure
app_annotation = app_annotation.with_columns(
pl.when(pl.col("type") == "CDS")
.then(pl.lit("Inosine"))
.otherwise(pl.col("transcript_biotype"))
.alias("transcript_biotype")
)
[8]:
traces = RNApy.make_traces(annotation=app_annotation,
x_start="rescaled_start", x_end="rescaled_end",
y='transcript_id', annotation_hue="transcript_biotype",
hover_start="start", hover_end="end",
expression_hue="AD status", marker_size=3, arrow_size=7)
## Put traces into a figure
fig = RNApy.make_plot(traces=traces, subplot_titles=["Transcript Structure", "Counts", "CPM", "Relative Abundance"],
width=1200, height=500)
## Show figure
fig.show()