import polars as pl
from RNApysoforms.utils import check_df
[docs]
def calculate_exon_number(annotation: pl.DataFrame, transcript_id_column: str = "transcript_id") -> pl.DataFrame:
"""
Assigns exon numbers to exons, CDS, and introns within a genomic annotation dataset based on transcript structure and strand direction.
This function processes a genomic annotation DataFrame to assign exon numbers to genomic features—specifically exons,
CDS (Coding Sequences), and introns—based on their position within each transcript and their strand orientation.
Exons are numbered sequentially within each transcript, accounting for the strand direction: numbering increases from
the 5' to 3' end on the positive strand and decreases on the negative strand. CDS and introns are assigned exon numbers
based on their overlap or adjacency to exons.
**Required Columns in `annotation`:**
- `start`: Start position of the feature.
- `end`: End position of the feature.
- `strand`: Strand direction of the feature (`"+"` or `"-"`).
- `type`: Type of the feature (must include `"exon"`. Can also include `"CDS"`, and/or `"intron"`).
- `transcript_id_column` (default `"transcript_id"`): Identifier for grouping features into transcripts.
Parameters
----------
annotation : pl.DataFrame
A Polars DataFrame containing genomic annotation data. Must include columns for start and end positions,
feature type, strand direction, and a grouping variable (default is 'transcript_id'). If a different
grouping variable is used, specify it using the `transcript_id_column` parameter.
transcript_id_column : str, optional
The column name that identifies transcript groups within the DataFrame, by default "transcript_id".
Returns
-------
pl.DataFrame
A Polars DataFrame that includes all original annotation data along with a new 'exon_number' column.
This column assigns exon numbers to exons, CDS, and introns based on their order and relationships within
each transcript.
Raises
------
TypeError
If the `annotation` parameter is not a Polars DataFrame.
ValueError
If required columns are missing from the `annotation` DataFrame based on the provided parameters.
Examples
--------
Assign exon numbers to genomic features:
>>> import polars as pl
>>> from RNApysoforms import calculate_exon_number
>>> df = pl.DataFrame({
... "transcript_id": ["tx1", "tx1", "tx2", "tx2"],
... "start": [100, 200, 300, 400],
... "end": [150, 250, 350, 450],
... "type": ["exon", "exon", "exon", "exon"],
... "strand": ["+", "+", "-", "-"]
})
>>> result_df = calculate_exon_number(df)
>>> print(result_df)
This will output a DataFrame where exons, CDS, and introns are numbered according to their order and relationships
within each transcript.
Notes
-----
- **Exon Numbering:**
- For transcripts on the positive strand (`"+"`), exons are numbered in ascending order based on their start positions.
- For transcripts on the negative strand (`"-"`), exons are numbered in descending order based on their end positions.
- **CDS Assignment:**
- CDS regions inherit the exon number of the overlapping exon. CDS regions must be within the boundaries of a
single exon, otherwise the function might return erroneous results.
- **Intron Assignment:**
- Introns are assigned the exon number based on the order the introns show up in the transcript.
the first intron will be assigned exon_number 1, so forth and so on.
- **Data Integrity:**
- The function ensures that all required columns are present and correctly formatted before processing.
- If no CDS or introns are present in the data, the function handles these cases gracefully without errors.
- **Performance:**
- Utilizes Polars' efficient data manipulation capabilities to handle large genomic datasets effectively.
"""
# Ensure 'annotation' is a Polars DataFrame
if not isinstance(annotation, pl.DataFrame):
raise TypeError(
f"Expected 'annotation' to be of type pl.DataFrame, got {type(annotation)}. "
"You can convert a pandas DataFrame to Polars using pl.from_pandas(pandas_df)."
)
# Ensure required columns are present in the DataFrame
required_columns = [transcript_id_column, "start", "end", "type", "strand"]
check_df(annotation, required_columns)
# Get original column order
column_order = annotation.columns
column_order = column_order + ["exon_number"]
# Step 1: Extract exons and assign exon numbers based on strand direction
# Filter the annotation DataFrame to include only 'exon' type features
exon_annotation = annotation.filter(pl.col('type') == 'exon')
# Assign exon numbers within each transcript group, accounting for strand direction
exon_annotation = exon_annotation.with_columns(
pl.when(pl.col('strand') == '+')
.then(
# For positive strand, rank exons in ascending order based on 'start' position
pl.col('start').rank('dense').over(transcript_id_column).cast(pl.Int64)
)
.otherwise(
# For negative strand, rank exons in descending order based on 'end' position
pl.col('end').rank('dense', descending=True).over(transcript_id_column).cast(pl.Int64)
)
.alias('exon_number')
)
# Step 2: Assign exon numbers to CDS entries based on overlapping exons
# Filter the annotation DataFrame to include only 'CDS' type features
cds_annotation = annotation.filter(pl.col('type') == 'CDS')
if not cds_annotation.is_empty():
# If there are CDS entries, proceed to assign exon numbers
# Join CDS with exons on the transcript ID to find overlapping exons
cds_exon_annotation = cds_annotation.join(
exon_annotation.select([transcript_id_column, 'start', 'end', 'exon_number']),
on=transcript_id_column,
how='left',
suffix='_exon'
).filter(
# Keep only CDS and exon pairs that overlap
(pl.col('start') <= pl.col('end_exon')) & (pl.col('end') >= pl.col('start_exon'))
)
# For each CDS, assign the minimum exon_number among overlapping exons
cds_exon_annotation = cds_exon_annotation.group_by([transcript_id_column, 'start', 'end', 'strand', 'type']).agg(
pl.col('exon_number').min().alias('exon_number')
)
else:
# If no CDS entries, create an empty DataFrame with 'exon_number' as None
cds_exon_annotation = cds_annotation.with_columns(
pl.lit(None).cast(pl.Int64).alias('exon_number')
)
# Step 3: Assign exon numbers to intron entries based on adjacent exons
# Filter the annotation DataFrame to include only 'intron' type features
intron_annotation = annotation.filter(pl.col('type') == 'intron')
# Initialize a list to collect intron annotations with assigned exon numbers
intron_exon_annotation_list = []
# Process introns separately for positive and negative strands
for strand in ['+', '-']:
# Filter introns and exons by strand
strand_introns = intron_annotation.filter(pl.col('strand') == strand)
strand_exons = exon_annotation.filter(pl.col('strand') == strand)
if not strand_introns.is_empty():
# Join introns with exons on the transcript ID to find adjacent exons
intron_exon_annotation = strand_introns.join(
strand_exons.select([transcript_id_column, 'start', 'end', 'exon_number']),
on=transcript_id_column,
how='left',
suffix='_exon'
)
if strand == '+':
# For positive strand, find exons where exon 'end' is less than or equal to intron 'start'
intron_exon_annotation = intron_exon_annotation.filter(pl.col('end_exon') <= pl.col('start'))
# For each intron, assign the exon_number of the preceding exon (max 'end_exon')
intron_exon_annotation = intron_exon_annotation.group_by([transcript_id_column, 'start', 'end', 'strand', 'type']).agg(
pl.col('exon_number').filter(pl.col('end_exon') == pl.col('end_exon').max()).first().alias('exon_number')
)
else:
# For negative strand, find exons where exon 'start' is greater than or equal to intron 'end'
intron_exon_annotation = intron_exon_annotation.filter(pl.col('start_exon') >= pl.col('end'))
# For each intron, assign the exon_number of the following exon (min 'start_exon')
intron_exon_annotation = intron_exon_annotation.group_by([transcript_id_column, 'start', 'end', 'strand', 'type']).agg(
pl.col('exon_number').filter(pl.col('start_exon') == pl.col('start_exon').min()).first().alias('exon_number')
)
# Add the processed intron annotations to the list
intron_exon_annotation_list.append(intron_exon_annotation)
if not intron_annotation.is_empty():
# Concatenate all intron annotations with assigned exon numbers
intron_exon_annotation = pl.concat(intron_exon_annotation_list)
else:
# If no intron entries, create an empty DataFrame with 'exon_number' as None
intron_exon_annotation = intron_annotation.with_columns(
pl.lit(None).cast(pl.Int64).alias('exon_number')
)
# Combine exons, CDS, and introns with their assigned exon numbers
result_annotation = pl.concat([
exon_annotation.select([transcript_id_column, 'start', 'end', 'strand', 'type', 'exon_number']),
cds_exon_annotation.select([transcript_id_column, 'start', 'end', 'strand', 'type', 'exon_number']),
intron_exon_annotation.select([transcript_id_column, 'start', 'end', 'strand', 'type', 'exon_number'])
])
## Only do this part if annotation is not empty
if not result_annotation.is_empty():
## Return all the original columns
result_annotation = result_annotation.join(annotation, on=required_columns, how="inner")
result_annotation = result_annotation[column_order]
result_annotation = result_annotation.sort([transcript_id_column, 'start'])
return result_annotation # Return the final DataFrame with assigned exon numbers