Source code for spared.gene_features.gene_features

import anndata as ad
import numpy as np
import pandas as pd
import pathlib
import squidpy as sq
import sys

# El path a spared es ahora diferente
SPARED_PATH = pathlib.Path(__file__).resolve().parent.parent
# Agregar el directorio padre al sys.path para los imports
sys.path.append(str(SPARED_PATH))
# Import im_encoder.py file
from filtering import filtering
# Remove the path from sys.path
sys.path.remove(str(SPARED_PATH))

[docs]def get_exp_frac(adata: ad.AnnData) -> ad.AnnData:
    """ Compute the expression fraction for all genes.

    The expression fraction of a gene in a slide is defined as the proportion of spots where that gene is expressed. It is a number between ``0.0``
    and ``1.0`` where ``0.0`` means that the gene is not expressed in any spot and ``1.0`` means that the gene is expressed in all the spots.

    To compute an aggregation of expression fractions in a complete dataset, this function gets the
    expression fraction for each slide and then takes the minimum across all the slides. Hence the final number is a lower bound that ensures
    that the gene is expressed in at least that fraction of the spots in each one of the slides.

    Args:
        adata (ad.AnnData): A slide collection where non-expressed genes have a value of ``0`` in the ``adata.X`` matrix.

    Returns:
        ad.AnnData: The updated slide collection with the added information into the ``adata.var['exp_frac']`` column.
    """
    # Get the unique slide ids
    slide_ids = adata.obs['slide_id'].unique()

    # Define zeros matrix of shape (n_genes, n_slides)
    exp_frac = np.zeros((adata.n_vars, len(slide_ids)))

    # Iterate over the slide ids
    for i, slide_id in enumerate(slide_ids):
        # Get current slide adata
        slide_adata = adata[adata.obs['slide_id'] == slide_id, :]
        # Get current slide expression fraction
        curr_exp_frac = np.squeeze(np.asarray((slide_adata.X > 0).sum(axis=0) / slide_adata.n_obs))
        # Add current slide expression fraction to the matrix
        exp_frac[:, i] = curr_exp_frac
    
    # Compute the minimum expression fraction for each gene across all the slides
    min_exp_frac = np.min(exp_frac, axis=1)

    # Add the minimum expression fraction to the var dataframe of the slide collection
    adata.var['exp_frac'] = min_exp_frac

    # Return the adata
    return adata

# TODO: Add reference for get_exp_frac function when mentioning the differences
[docs]def get_glob_exp_frac(adata: ad.AnnData) -> ad.AnnData:
    """ Compute the global expression fraction for all genes.
    
    This function computes the global expression fraction for each gene in a dataset.

    The global expression fraction of a gene in a dataset is defined as the proportion of spots where that gene is expressed. It is a number between ``0.0``
    and ``1.0`` where ``0.0`` means that the gene is not expressed in any spot and ``1.0`` means that the gene is expressed in all the spots. Its difference
    with the expression fraction is that the global expression fraction is computed for the whole dataset and not for each slide.

    Args:
        adata (ad.AnnData): A slide collection where a non-expressed genes have a value of ``0`` in the ``adata.X`` matrix.

    Returns:
        ad.AnnData: The updated slide collection with the information added into the  ``adata.var['glob_exp_frac']`` column.
    """
    # Get global expression fraction
    glob_exp_frac = np.squeeze(np.asarray((adata.X > 0).sum(axis=0) / adata.n_obs))

    # Add the global expression fraction to the var dataframe of the slide collection
    adata.var['glob_exp_frac'] = glob_exp_frac

    # Return the adata
    return adata

# TODO: Add link to what the moran's I statistic is in wikipedia or something
[docs]def compute_moran(adata: ad.AnnData, from_layer: str, hex_geometry: bool) -> ad.AnnData:
    """Compute Moran's I statistic for each gene.

    Compute average Moran's I statistic for a collection of slides. Internally cycles over each slide in the ``adata`` collection
    and computes the Moran's I statistic for each gene. After that, it averages the Moran's I for each gene across all
    slides and saves it in ``adata.var[f'{from_layer}_moran']``.The input data for the Moran's I computation is ``adata.layers[from_layer]``.

    Args:
        adata (ad.AnnData): The AnnData object to update. Must have expression values in ``adata.layers[from_layer]``.
        from_layer (str): The key in ``adata.layers`` with the values used to compute Moran's I.
        hex_geometry (bool): Whether the geometry is hexagonal or not. This is used to compute the spatial neighbors before computing Moran's I. Only ``True`` for visium datasets.

    Returns:
        ad.AnnData: The updated AnnData object with the average Moran's I for each gene in ``adata.var[f'{from_layer}_moran']``.
    """
    print(f'Computing Moran\'s I for each gene over each slide using data of layer {from_layer}...')

    # Get the unique slide_ids
    slide_ids = adata.obs['slide_id'].unique()

    # Create a dataframe to store the Moran's I for each slide
    moran_df = pd.DataFrame(index = adata.var.index, columns=slide_ids)

    # Cycle over each slide
    for slide in slide_ids:
        # Get the annData for the current slide
        slide_adata = filtering.get_slide_from_collection(adata, slide)
        # Compute spatial_neighbors
        if hex_geometry:
            # Hexagonal visium case
            sq.gr.spatial_neighbors(slide_adata, coord_type='generic', n_neighs=6)
        else:
            # Grid STNet dataset case
            sq.gr.spatial_neighbors(slide_adata, coord_type='grid', n_neighs=8)
        # Compute Moran's I
        sq.gr.spatial_autocorr(
            slide_adata,
            mode="moran",
            layer=from_layer,
            genes=slide_adata.var_names,
            n_perms=1000,
            n_jobs=-1,
            seed=42
        )

        # Get moran I
        moranI = slide_adata.uns['moranI']['I']
        # Reindex moranI to match the order of the genes in the adata object
        moranI = moranI.reindex(adata.var.index)

        # Add the Moran's I to the dataframe
        moran_df[slide] = moranI

    # Compute the average Moran's I for each gene
    adata.var[f'{from_layer}_moran'] = moran_df.mean(axis=1)

    # Return the updated AnnData object
    return adata