Source code for RNApysoforms.read_expression_matrix

import polars as pl
from typing import Optional, List
import warnings
import os


[docs]
def process_expression_matrix(
    expression_df: pl.DataFrame,
    metadata_df: Optional[pl.DataFrame] = None,
    expression_measure_name: str = "counts",
    cpm_normalization: bool = False,
    relative_abundance: bool = False,
    gene_id_column_name: Optional[str] = "gene_id",
    transcript_id_column_name: str = "transcript_id",
    metadata_sample_id_column: str = "sample_id"
) -> pl.DataFrame:
    """
    Processes an expression matrix DataFrame, optionally merging with metadata, performing CPM normalization, and calculating relative transcript abundance.

    This function takes an already-loaded Polars DataFrame containing expression data and processes it by:
    - Validating required columns
    - Optionally calculating Counts Per Million (CPM) normalization
    - Optionally calculating relative transcript abundance based on gene counts
    - Converting from wide to long format
    - Optionally merging with metadata

    This function is useful when you have already loaded your expression data into a Polars DataFrame and want to process it
    without reading from a file. If you need to read from a file, use :func:`read_expression_matrix` instead.

    **Expression DataFrame Format Requirements:**
    - Must be in wide format with samples as columns and transcripts as rows.
    - Must contain the column specified by `transcript_id_column_name` (default "transcript_id").
    - If `gene_id_column_name` is provided and not None, that column must be present.
    - All other columns are assumed to be sample expression values and must contain numeric values.

    Example expression DataFrame format::

        transcript_id    gene_id    sample1    sample2    sample3
        ENST0001         ENSG001    100        200        150
        ENST0002         ENSG001    50         75         60
        ENST0003         ENSG002    300        250        275

    **Metadata DataFrame Format Requirements (if provided):**
    - Must contain the column specified by `metadata_sample_id_column` (default "sample_id").
    - Values in this column must exactly match the sample column names in the expression DataFrame.
    - Can contain any number of additional metadata columns.

    Example metadata DataFrame format::

        sample_id    condition    batch
        sample1      control      1
        sample2      treated      1
        sample3      control      2

    Parameters
    ----------
    expression_df : pl.DataFrame
        A Polars DataFrame in wide format containing expression data. Must contain transcript_id column and sample expression columns.
    metadata_df : pl.DataFrame, optional
        A Polars DataFrame containing metadata. If provided, will be merged with the expression data on the specified sample identifier column.
        Default is None.
    expression_measure_name : str, optional
        The name to assign to the expression measure column after melting. This will be the name of the column containing the expression values 
        in the long-format DataFrame. Default is `"counts"`.
    cpm_normalization : bool, optional
        Whether to perform Counts Per Million (CPM) normalization on the expression data. If True, CPM values will be calculated for each sample. 
        Default is False.
    relative_abundance : bool, optional
        Whether to calculate relative transcript abundance based on gene counts. Requires `gene_id_column_name` to be provided and not None. 
        Default is False.
    gene_id_column_name : str, optional
        The name of the column in the expression DataFrame that contains gene identifiers. This column will remain fixed during data transformation.
        If provided and `relative_abundance` is True, relative transcript abundance will be calculated. Default is `"gene_id"`. If set to None, 
        the gene identifier will not be used.
    transcript_id_column_name : str
        The name of the column in the expression DataFrame that contains transcript identifiers. This parameter is required and cannot be None. 
        Default is `"transcript_id"`.
    metadata_sample_id_column : str, optional
        Column name in the metadata DataFrame that identifies samples. This column is used to merge the metadata and expression data. 
        Also used as the variable name when melting the expression data. Default is `"sample_id"`.

    Returns
    -------
    pl.DataFrame
        A Polars DataFrame in long format containing the expression data, and optionally CPM values, relative abundances, and metadata.

    Raises
    ------
    ValueError
        If `transcript_id_column_name` is None.
        If required feature ID columns are missing in the expression DataFrame.
        If the expression columns are not numeric.
        If required columns are missing in the metadata DataFrame.
        If there are no overlapping sample IDs between expression data and metadata.

    Warns
    -----
    UserWarning
        If `relative_abundance` is True but `gene_id_column_name` is None.
        If there is partial overlap of sample IDs between expression data and metadata.

    Examples
    --------
    Process an already-loaded expression matrix DataFrame:

    >>> import polars as pl
    >>> from RNApysoforms import process_expression_matrix
    >>> 
    >>> # Create a sample expression DataFrame
    >>> expr_df = pl.DataFrame({
    ...     "transcript_id": ["tx1", "tx2", "tx3"],
    ...     "gene_id": ["gene1", "gene1", "gene2"],
    ...     "sample1": [100, 150, 300],
    ...     "sample2": [200, 250, 400]
    ... })
    >>> 
    >>> # Process the DataFrame
    >>> df = process_expression_matrix(
    ...     expression_df=expr_df,
    ...     expression_measure_name="counts",
    ...     cpm_normalization=True,
    ...     relative_abundance=True
    ... )
    >>> print(df.head())

    Process with metadata:

    >>> # Create sample metadata DataFrame
    >>> metadata_df = pl.DataFrame({
    ...     "sample_id": ["sample1", "sample2"],
    ...     "condition": ["control", "treated"]
    ... })
    >>> 
    >>> # Process with metadata
    >>> df = process_expression_matrix(
    ...     expression_df=expr_df,
    ...     metadata_df=metadata_df,
    ...     cpm_normalization=True,
    ...     relative_abundance=True
    ... )
    >>> print(df.head())

    Notes
    -----
    - The `transcript_id_column_name` is set to "transcript_id" by default. The parameter is required and cannot be None.
    - If CPM normalization is performed, the expression measures will be scaled to reflect Counts Per Million for each sample.
    - If `gene_id_column_name` is provided and `relative_abundance` is True, relative transcript abundance is calculated as 
      `(transcript_expression / total_gene_expression) * 100`. If the total gene counts are zero, the relative abundance is set to zero 
      to avoid division by zero errors.
    - Warnings are raised if there is only partial sample overlap between expression data and metadata.
    - The resulting DataFrame is returned in long format, with expression measures, CPM values, and relative abundance for each 
      sample-feature combination.
    - Beware of using the `cpm_normalization` and `relative_abundance` options set to `True` when working with a non-raw (i.e., normalized) 
      counts matrix as those results may not be accurate causing misinterpretation.

    See Also
    --------
    read_expression_matrix : Load and process expression data from a file.
    """

    # Check if transcript_id_column_name is None and raise an error if so
    if transcript_id_column_name is None:
        raise ValueError("The 'transcript_id_column_name' is required and cannot be None.")

    # Build a list of feature ID columns to keep fixed during data transformation
    feature_id_columns = [transcript_id_column_name]
    if gene_id_column_name is not None:
        feature_id_columns.append(gene_id_column_name)

    # Check if required feature ID columns are present in the expression DataFrame
    missing_columns = [col for col in feature_id_columns if col not in expression_df.columns]
    if missing_columns:
        raise ValueError(f"The following feature ID columns are missing in the expression dataframe: {missing_columns}")

    # Determine the expression columns by excluding feature ID columns
    expression_columns = [col for col in expression_df.columns if col not in feature_id_columns]

    # Check that expression columns are numeric
    non_numeric_columns = [col for col in expression_columns if not expression_df[col].dtype.is_numeric()]
    if non_numeric_columns:
        raise ValueError(f"The following columns are expected to be numerical but are not: {non_numeric_columns}")

    # Calculate relative transcript abundance if gene_id_column_name is provided and relative_abundance is True
    if relative_abundance and gene_id_column_name is not None:
        # Calculate total gene counts for each gene across transcripts
        expression_df = expression_df.with_columns([
            (
                pl.when(pl.col(col).sum().over(gene_id_column_name) == 0)
                .then(0)
                .otherwise((pl.col(col) / pl.col(col).sum().over(gene_id_column_name)) * 100)
                .alias(col + "_relative_abundance")
            )
            for col in expression_columns
        ])
    # If relative_abundance is True but gene_id_column_name is None, issue a warning
    elif relative_abundance and gene_id_column_name is None:
        warnings.warn(
            "relative_abundance was set to True, but gene_id_column_name was not provided (set to None). "
            "Therefore, relative abundance calculation is being skipped.",
            UserWarning
        )

    # Perform CPM normalization if requested
    if cpm_normalization:
        # Calculate total counts for each sample and scale counts to Counts Per Million (CPM)
        expression_df = expression_df.with_columns([
            (
                (pl.col(col) / pl.col(col).sum()) * 1e6
            ).alias(col + "_CPM")
            for col in expression_columns
        ])

    # Transform the expression DataFrame into long format for expression_measure_name
    expression_long = expression_df.unpivot(
        index=feature_id_columns,
        on=expression_columns,
        variable_name=metadata_sample_id_column,
        value_name=expression_measure_name
    )

    # Initialize long_expression_df with expression_long
    long_expression_df = expression_long

    # If CPM normalization was performed, melt CPM columns and join them
    if cpm_normalization:
        cpm_columns = [col + "_CPM" for col in expression_columns]

        cpm_long = expression_df.unpivot(
            index=feature_id_columns,
            on=cpm_columns,
            variable_name=metadata_sample_id_column,
            value_name="CPM"
        ).with_columns(
            # Remove the '_CPM' suffix from sample IDs if present
            pl.col(metadata_sample_id_column).str.replace(r"_CPM$", "")
        )

        # Join the CPM values to the long_expression_df
        long_expression_df = long_expression_df.join(
            cpm_long,
            on=feature_id_columns + [metadata_sample_id_column],
            how="left"
        )

    # If relative abundance was calculated, melt and join the relative abundance columns
    if relative_abundance and gene_id_column_name is not None:
        relative_abundance_columns = [col + "_relative_abundance" for col in expression_columns]

        relative_abundance_long = expression_df.unpivot(
            index=feature_id_columns,
            on=relative_abundance_columns,
            variable_name=metadata_sample_id_column,
            value_name="relative_abundance"
        ).with_columns(
            # Remove the '_relative_abundance' suffix from sample IDs if present
            pl.col(metadata_sample_id_column).str.replace(r"_relative_abundance$", "")
        )

        # Join the relative abundance values to the long_expression_df
        long_expression_df = long_expression_df.join(
            relative_abundance_long,
            on=feature_id_columns + [metadata_sample_id_column],
            how="left"
        )

    # If metadata_df is provided, merge metadata
    if metadata_df is not None:
        # Check if metadata_sample_id_column is present in metadata_df
        if metadata_sample_id_column not in metadata_df.columns:
            raise ValueError(
                f"The metadata_sample_id_column '{metadata_sample_id_column}' is not present in the metadata dataframe."
            )

        # Get unique sample IDs from expression data and metadata
        expression_sample_ids = long_expression_df[metadata_sample_id_column].unique().to_list()
        metadata_sample_ids = metadata_df[metadata_sample_id_column].unique().to_list()

        # Find overlapping sample IDs between expression data and metadata
        overlapping_sample_ids = set(expression_sample_ids).intersection(set(metadata_sample_ids))

        if not overlapping_sample_ids:
            raise ValueError("No overlapping sample IDs found between expression data and metadata.")

        # Warn about sample ID mismatches
        metadata_sample_ids_not_in_expression = set(metadata_sample_ids) - set(expression_sample_ids)
        expression_sample_ids_not_in_metadata = set(expression_sample_ids) - set(metadata_sample_ids)

        warning_message = ""
        if metadata_sample_ids_not_in_expression:
            warning_message += (f"The following sample IDs are present in metadata but not in expression data: {list(metadata_sample_ids_not_in_expression)}. "
                                "Only returning sample IDs that are present in both.")
        if expression_sample_ids_not_in_metadata:
            warning_message += (f"The following sample IDs are present in expression data but not in metadata: {list(expression_sample_ids_not_in_metadata)}. "
                                "Only returning sample IDs that are present in both.")
        if warning_message:
            warnings.warn(warning_message)

        # Merge metadata with the long_expression_df on metadata_sample_id_column
        long_expression_df = long_expression_df.join(
            metadata_df,
            on=metadata_sample_id_column,
            how="inner"
        )

    # Return the final long-format DataFrame
    return long_expression_df



[docs]
def read_expression_matrix(
    expression_matrix_path: str,
    metadata_path: Optional[str] = None,
    expression_measure_name: str = "counts",
    cpm_normalization: bool = False,
    relative_abundance: bool = False,
    gene_id_column_name: Optional[str] = "gene_id",
    transcript_id_column_name: str = "transcript_id",
    metadata_sample_id_column: str = "sample_id"
) -> pl.DataFrame:
    """
    Loads and processes an expression matrix, optionally merging with metadata, performing CPM normalization, and calculating relative transcript abundance.

    This function reads an expression matrix file and, optionally, a metadata file, merging the two on a specified sample identifier column.
    It supports performing Counts Per Million (CPM) normalization and calculating relative transcript abundance based on gene counts.
    The resulting DataFrame is returned in long format, including the expression measures, optional CPM values, relative abundances, and metadata if provided.

    **Expression Matrix Format Requirements:**
    - Must be in wide format with samples as columns and transcripts as rows.
    - The first column(s) must contain the feature identifiers (transcript_id and, optionally, gene_id).
    - Sample columns must contain numeric expression values (e.g., read counts).
    - Sample column names must exactly match the values in the metadata_sample_id_column for proper merging if providing a metadata file.
    
    Example expression matrix format::

        transcript_id    gene_id    sample1    sample2    sample3
        ENST0001         ENSG001    100        200        150
        ENST0002         ENSG001    50         75         60
        ENST0003         ENSG002    300        250        275

    **Required Columns in Expression Matrix:**
    - `transcript_id_column_name` (default "transcript_id"): Identifier for each transcript.
    - If `gene_id_column_name` is provided and not None, it must be a column in the expression matrix.
    - All other columns are assumed to be sample expression values and must:
        - Contain numeric values.
        - Have names that match the values in the metadata_sample_id_column if metadata is provided.

    **Metadata Format Requirements (if provided):**
    - Must contain the metadata_sample_id_column (default "sample_id").
    - Values in this column must exactly match the sample column names in the expression matrix.
    - Can contain any number of additional metadata columns.

    Example metadata format::   

        sample_id    condition    batch
        sample1      control      1
        sample2      treated      1
        sample3      control      2


    **Supported File Formats:**
    - `.csv`, `.tsv`, `.txt`, `.parquet`, `.xlsx` for both expression matrix and metadata files.

    Parameters
    ----------
    expression_matrix_path : str
        Path to the expression matrix file. Supported file formats include `.csv`, `.tsv`, `.txt`, `.parquet`, and `.xlsx`.
    metadata_path : str, optional
        Path to the metadata file. If provided, the metadata will be merged with the expression data on the specified sample identifier column.
        Supported file formats are the same as for `expression_matrix_path`. Default is None.
    expression_measure_name : str, optional
        The name to assign to the expression measure column after melting. This will be the name of the column containing the expression values in the long-format DataFrame. Default is `"counts"`.
    cpm_normalization : bool, optional
        Whether to perform Counts Per Million (CPM) normalization on the expression data. If True, CPM values will be calculated for each sample. Default is False.
    relative_abundance : bool, optional
        Whether to calculate relative transcript abundance based on gene counts. Requires `gene_id_column_name` to be provided and not None. Default is False.
    gene_id_column_name : str, optional
        The name of the column in the expression DataFrame that contains gene identifiers. This column will remain fixed during data transformation.
        If provided and `relative_abundance` is True, relative transcript abundance will be calculated. Default is `"gene_id"`. If set to None, the gene identifier will not be used.
    transcript_id_column_name : str
        The name of the column in the expression DataFrame that contains transcript identifiers. This parameter is required and cannot be None. Default is `"transcript_id"`.
    metadata_sample_id_column : str, optional
        Column name in the metadata DataFrame that identifies samples. This column is used to merge the metadata and expression data. Default is `"sample_id"`.

    Returns
    -------
    pl.DataFrame
        A Polars DataFrame in long format containing the expression data, and optionally CPM values, relative abundances, and metadata.

    Raises
    ------
    ValueError
        If `transcript_id_column_name` is None.
        If required feature ID columns are missing in the expression matrix.
        If the expression columns are not numeric.
        If required columns are missing in the metadata DataFrame.
        If there are no overlapping sample IDs between expression data and metadata.
        If the file format is unsupported or the file cannot be read.

    Warns
    -----
    UserWarning
        If `relative_abundance` is True but `gene_id_column_name` is None.
        If there is partial overlap of sample IDs between expression data and metadata.

    Examples
    --------
    Load an expression matrix, perform CPM normalization, calculate relative transcript abundance, and merge with metadata:

    >>> from RNApysoforms import read_expression_matrix
    >>> df = read_expression_matrix(
    ...     expression_matrix_path="counts.csv",
    ...     metadata_path="metadata.csv",
    ...     expression_measure_name="counts",
    ...     cpm_normalization=True,
    ...     relative_abundance=True
    ... )
    >>> print(df.head())

    Notes
    -----
    - The `transcript_id_column_name` is set to "transcript_id" by default. The parameter is required and cannot be None.
    - The function supports multiple file formats (`.csv`, `.tsv`, `.txt`, `.parquet`, `.xlsx`) for both expression and metadata files.
    - If CPM normalization is performed, the expression measures will be scaled to reflect Counts Per Million for each sample.
    - If `gene_id_column_name` is provided and `relative_abundance` is True, relative transcript abundance is calculated as 
      `(transcript_expression / total_gene_expression) * 100`. If the total gene counts are zero, the relative abundance is set to zero to avoid division by zero errors.
    - Warnings are raised if there is only partial sample overlap between expression data and metadata.
    - The resulting DataFrame is returned in long format, with expression measures, CPM values, relative abundance for each sample-feature combination.
    - The `expression_measure_name` allows customization of the name of the expression values column in the long-format DataFrame.
    - If a metadata file is passed, the function expects that the values in the `metadata_sample_id_column` from the metadata file will be found as 
      column names in the counts matrix file.
    - Beware of using the `cpm_normalization` and `relative_abundance` options set to `True` when working with a non-raw (i.e., normalized) counts
      matrix as those results may not be accurate causing misinterpretation.
    - An example counts matrix file can be found here: https://github.com/UK-SBCoA-EbbertLab/RNApysoforms/blob/main/tests/test_data/counts_matrix_chr21_and_Y.tsv
    - An example metadata file can be found here: https://github.com/UK-SBCoA-EbbertLab/RNApysoforms/blob/main/tests/test_data/sample_metadata.tsv

    See Also
    --------
    process_expression_matrix : Process an already-loaded expression DataFrame without reading from a file.
    """

    # Load the expression matrix file using the helper function
    expression_df = _get_open_file(expression_matrix_path)

    # If metadata_path is provided, load the metadata file
    metadata_df = None
    if metadata_path is not None:
        metadata_df = _get_open_file(metadata_path)

    # Process the loaded DataFrames using the process_expression_matrix function
    return process_expression_matrix(
        expression_df=expression_df,
        metadata_df=metadata_df,
        expression_measure_name=expression_measure_name,
        cpm_normalization=cpm_normalization,
        relative_abundance=relative_abundance,
        gene_id_column_name=gene_id_column_name,
        transcript_id_column_name=transcript_id_column_name,
        metadata_sample_id_column=metadata_sample_id_column
    )


def _get_open_file(file_path: str) -> pl.DataFrame:
    """
    Opens a file based on its extension and loads it into a Polars DataFrame.

    This helper function supports multiple file formats such as `.csv`, `.tsv`, `.txt`, `.parquet`, and `.xlsx`.
    It automatically determines the correct method to open the file based on its extension.

    Parameters
    ----------
    file_path : str
        The path to the file to be opened.

    Returns
    -------
    pl.DataFrame
        A Polars DataFrame containing the contents of the file.

    Raises
    ------
    ValueError
        If the file extension is unsupported.
        If the file cannot be read due to an error.

    Examples
    --------
    Open a CSV file:

    >>> df = _get_open_file("data.csv")

    Open a TSV file:

    >>> df = _get_open_file("data.tsv")

    Notes
    -----
    - This function is used internally by `read_expression_matrix` to load expression and metadata files.
    - It handles different file extensions and raises a clear error if the file format is unsupported.
    """

    # Extract the file extension to determine the file format
    _, file_extension = os.path.splitext(file_path)

    try:
        # Open the file based on its extension
        if file_extension in [".tsv", ".txt"]:
            # Read tab-separated values
            return pl.read_csv(file_path, separator="\t", infer_schema_length=100000)
        elif file_extension == ".csv":
            # Read comma-separated values
            return pl.read_csv(file_path, infer_schema_length=100000)
        elif file_extension == ".parquet":
            # Read Parquet file
            return pl.read_parquet(file_path)
        elif file_extension == ".xlsx":
            # Read Excel file
            return pl.read_excel(file_path, infer_schema_length=100000)
        else:
            # Raise an error for unsupported file extensions
            raise ValueError(
                f"Unsupported file extension '{file_extension}'. Supported extensions are .tsv, .txt, .csv, .parquet, .xlsx"
            )
    except Exception as e:
        # Raise an error if the file cannot be read
        raise ValueError(f"Failed to read the file '{file_path}': {e}")