Source code for RNApysoforms.to_intron

import polars as pl
from RNApysoforms.utils import check_df



[docs]
def to_intron(annotation: pl.DataFrame, transcript_id_column: str = "transcript_id") -> pl.DataFrame:
    """
    Converts exon coordinates into corresponding intron coordinates within a genomic annotation dataset.

    This function identifies introns by calculating the genomic intervals between consecutive exons for each transcript.
    It returns a DataFrame with the calculated intron coordinates and retains relevant grouping based on the specified
    `transcript_id_column`, typically 'transcript_id'. If intron entries are already present
     in the input data, the function exits with an error.

    Parameters
    ----------
    annotation : pl.DataFrame
        A Polars DataFrame containing genomic annotations, including exon coordinates with the following required columns:
        - `seqnames`: Chromosome or sequence name.
        - `start`: Start position of the exon.
        - `end`: End position of the exon.
        - `type`: Feature type, expected to include "exon".
        - `exon_number`: Numerical identifier for exons.
        - `transcript_id_column`: Column used to group transcripts, typically "transcript_id".
    transcript_id_column : str, optional
        The column used to group data, typically 'transcript_id'. Default is 'transcript_id'.

    Returns
    -------
    pl.DataFrame
        A Polars DataFrame containing both exon and intron coordinates, including other genomic features such as CDS if present.
        The DataFrame includes the following columns:
        - `seqnames`
        - `start`
        - `end`
        - `type` ("exon" or "intron")
        - `exon_number`
        - Additional columns from the input DataFrame.

    Raises
    ------
    TypeError
        If `annotation` is not a Polars DataFrame.
    ValueError
        If the input DataFrame does not contain the required columns (`seqnames`, `start`, `end`, `type`, `exon_number`, and `transcript_id_column`).
        If the input DataFrame already contains introns.

    Examples
    --------
    Convert exons into introns:

    >>> import polars as pl
    >>> from RNApysoforms import to_intron
    >>> df = pl.DataFrame({
    ...    "seqnames": ["chr1", "chr1", "chr1"],
    ...    "start": [100, 200, 300],
    ...    "end": [150, 250, 350],
    ...    "type": ["exon", "exon", "exon"],
    ...    "transcript_id": ["tx1", "tx1", "tx1"],
    ...    "strand": ["+", "+", "+"],
    ...    "exon_number": [1, 2, 3]
    ... })

    >>> df_with_introns = to_intron(df, transcript_id_column="transcript_id")
    >>> print(df_with_introns.head())

    This will return a DataFrame with calculated intron positions between the provided exon coordinates.

    Notes
    -----
    - The function filters out invalid introns where `start` or `end` is null, and introns with length ≤ 1 are discarded.
    - The input DataFrame must contain the required columns listed above.
    - The function can handle input DataFrames with or without existing intron entries. If intron entries are absent, the function generates them.
    - Additional genomic features (e.g., CDS) present in the input DataFrame are retained and merged with intron entries.
    - The function does not adjust intron positions by adding or subtracting 1; intron positions are directly taken from exon boundaries.

    """

    # Check if annotation is a Polars DataFrame
    if not isinstance(annotation, pl.DataFrame):
        raise TypeError(
            f"Expected 'annotation' to be of type pl.DataFrame, got {type(annotation)}."
            "\nYou can convert a pandas DataFrame to Polars using: polars_df = pl.from_pandas(pandas_df)"
        )

    # Validate the input DataFrame to ensure required columns are present
    check_df(annotation, ["seqnames", "start", "end", "type", "exon_number", transcript_id_column])


    ## Make sure annotation has no introns
    if not annotation.filter(pl.col("type") == "intron").is_empty():
        raise ValueError("Your annotation already has introns, please get rid of them before using this function")
    
    ## Make sure annotation has no introns
    if annotation.filter(pl.col("type") == "exon").is_empty():
        raise ValueError("Your annotation must contains exon entries.")
    
    # Separate exons from the annotation data
    exons = annotation.filter(pl.col("type") == "exon").clone()

    # Sort exons by transcript ID and start position
    exons_sorted = exons.sort([transcript_id_column, "start"])

    # Add the end position of the previous exon within each transcript
    exons_with_shift = exons_sorted.with_columns([
        pl.col("end").shift(1).over(transcript_id_column).alias("prev_end"),
        pl.col("exon_number").shift(1).over(transcript_id_column).alias("prev_exon_number"),
    ])

    # Check for overlapping exons
    overlaps = exons_with_shift.filter(
        (pl.col("start") <= pl.col("prev_end")) &
        (pl.col("prev_end").is_not_null())
    ).select([
        transcript_id_column, "exon_number", "start", "end",
        pl.col("prev_exon_number"), pl.col("prev_end")
    ])

    # If overlaps are found, raise an error
    if not overlaps.is_empty():
        raise ValueError(
            f"Your annotation has exons with overlapping coordinates for the same {transcript_id_column} value. "
            "Two exons from the same transcript should not overlap; your annotation may be corrupted. "
            f"Here are the problematic entries:\n{overlaps}"
        )


    # Separate exons and other features (e.g., CDS) from the annotation data
    exons = annotation.filter(pl.col("type") == "exon")
    other_features = annotation.filter(pl.col("type") != "exon")

    # Sort exons by transcript ID and genomic coordinates to ensure correct intron calculation
    sort_columns = [transcript_id_column, 'start', 'end']
    exons_sorted = exons.sort(sort_columns)

    # Calculate intron start and end positions by shifting exon coordinates within each transcript group
    exons_with_introns = exons_sorted.with_columns([
        (pl.col('end').shift(1).over(transcript_id_column) + 1).alias('intron_start'),  # Intron start = end of previous exon + 1 (GTF coordinates)
        (pl.col('start') - 1).alias('intron_end'),                                      # Intron end = start of current exon - 1 (GTF coordinates)
        pl.col("exon_number").shift(1).over(transcript_id_column).alias('intron_number'), ## Get intron number
        pl.lit('intron').alias('type')                                            # Set feature type as 'intron'
    ])

    # Exclude columns that are either renamed or already processed
    exclude_cols = ['start', 'end', 'intron_start', 'intron_end', 'type', 'exon_number']
    columns_to_add = [col for col in exons.columns if col not in exclude_cols]

    # Handle additional columns by taking the first value in each group if transcript_id_column exists
    if transcript_id_column:
        other_cols_expr = [pl.col(col).first().over(transcript_id_column).alias(col) for col in columns_to_add]
    else:
        other_cols_expr = [pl.col(col).first().alias(col) for col in columns_to_add]

    # Select intron columns and include any additional required columns
    introns = exons_with_introns.select([
        pl.col('intron_start').alias('start'),  # Intron start position
        pl.col('intron_end').alias('end'),      # Intron end position
        pl.col("intron_number").alias("exon_number"),                  # Retain exon_number column for reference
        pl.col('type'),                         # Type of feature ('intron')
        *other_cols_expr                        # Include additional columns as necessary
    ])

    # Fix exon number for negative strand introns
    introns = introns.with_columns(
    pl.when(pl.col("strand") == "-")
    .then(pl.col("exon_number") - 1)
    .otherwise(pl.col("exon_number"))
    .alias("exon_number")
    )


    # Remove rows where either 'start' or 'end' is null (invalid introns)
    introns = introns.drop_nulls(subset=['start', 'end'])

    # Filter out introns where the length is 1 or less (invalid introns)
    introns = introns.filter((pl.col('end') - pl.col('start')).abs() > 1)

    # Cast 'start' and 'end' columns to integers for genomic coordinates
    introns = introns.with_columns([
        pl.col('start').cast(pl.Int64),
        pl.col('end').cast(pl.Int64)
    ])

    # Reorder intron columns to match the order of exons for consistency
    introns = introns[exons.columns]

    # Concatenate exons, other features, and introns into a single DataFrame
    combined_annotation = pl.concat([exons, other_features, introns])

    # Sort the combined DataFrame by 'seqnames', transcript_id_column, 'start', 'end', and 'type' for organized output
    combined_annotation = combined_annotation.sort(
        ["seqnames", transcript_id_column, "start", "end", "type"], descending=False
    )

    return combined_annotation  # Return the combined DataFrame with intron entries