Source code for spared.layer_operations.layer_operations

import anndata as ad
import os
from time import time
import numpy as np
import pandas as pd
import pathlib
import shutil
import wget
import gzip
import subprocess
from combat.pycombat import pycombat
import scanpy as sc
from sklearn.preprocessing import StandardScaler
import sys

# Path a spared 
SPARED_PATH = pathlib.Path(__file__).resolve().parent.parent
# Agregar el directorio padre al sys.path para los imports
sys.path.append(str(SPARED_PATH))
# Import im_encoder.py file
from denoising import denoising
from gene_features import gene_features
from filtering import filtering
# Remove the path from sys.path
sys.path.remove(str(SPARED_PATH))


### Expression data processing functions:
# TODO: CHange here and elsewhere the order of the organism and adata parameters
[docs]def tpm_normalization(organism: str, adata: ad.AnnData, from_layer: str, to_layer: str) -> ad.AnnData: """Normalize expression using TPM normalization. This function applies TPM normalization to an AnnData object with raw counts. It also removes genes that are not fount in the ``.gtf`` annotation file. The counts are taken from ``adata.layers[from_layer]`` and the results are stored in ``adata.layers[to_layer]``. It can perform the normalization for `human <https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_43/gencode.v43.basic.annotation.gtf.gz>`_ and `mouse <https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M33/gencode.vM33.basic.annotation.gtf.gz>`_ reference genomes. To specify which GTF annotation file should be used, the ``'organism'`` parameter must be ``'mouse'`` or ``'human'``. Args: adata (ad.Anndata): The Anndata object to normalize. organism (str): Organism of the dataset. Must be ``'mouse'`` or ``'human'``. from_layer (str): The layer to take the counts from. The data in this layer should be in raw counts. to_layer (str): The layer to store the results of the normalization. Returns: ad.Anndata: The updated Anndata object with TPM values in ``adata.layers[to_layer]``. """ # Get the number of genes before filtering initial_genes = adata.shape[1] # Automatically download the human gtf annotation file if it is not already downloaded if not os.path.exists(os.path.join(SPARED_PATH, 'data', 'annotations', 'gencode.v43.basic.annotation.gtf.gz')): print('Automatically downloading human gtf annotation file...') os.makedirs(os.path.join(SPARED_PATH, 'data', 'annotations'), exist_ok=True) wget.download( 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_43/gencode.v43.basic.annotation.gtf.gz', out = os.path.join(SPARED_PATH, 'data', 'annotations', 'gencode.v43.basic.annotation.gtf.gz')) # Define gtf human path gtf_path = os.path.join(SPARED_PATH, 'data', 'annotations', 'gencode.v43.basic.annotation.gtf') # Unzip the data in annotations folder if it is not already unzipped if not os.path.exists(gtf_path): with gzip.open(os.path.join(SPARED_PATH, 'data', 'annotations', 'gencode.v43.basic.annotation.gtf.gz'), 'rb') as f_in: with open(gtf_path, 'wb') as f_out: shutil.copyfileobj(f_in, f_out) # FIXME: Set up automatic download of mouse gtf file (DONE) # Automatically download the mouse gtf annotation file if it is not already downloaded if not os.path.exists(os.path.join(SPARED_PATH, 'data', 'annotations', 'gencode.vM33.basic.annotation.gtf.gz')): print('Automatically downloading mouse gtf annotation file...') os.makedirs(os.path.join(SPARED_PATH, 'data', 'annotations'), exist_ok=True) wget.download( 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M33/gencode.vM33.basic.annotation.gtf.gz', out = os.path.join(SPARED_PATH, 'data', 'annotations', 'gencode.vM33.basic.annotation.gtf.gz')) # Define gtf mouse path gtf_path_mouse = os.path.join(SPARED_PATH, 'data', 'annotations', 'gencode.vM33.basic.annotation.gtf') # Unzip the data in annotations folder if it is not already unzipped if not os.path.exists(gtf_path_mouse): with gzip.open(os.path.join(SPARED_PATH, 'data', 'annotations', 'gencode.vM33.basic.annotation.gtf.gz'), 'rb') as f_in: with open(gtf_path_mouse, 'wb') as f_out: shutil.copyfileobj(f_in, f_out) # Obtain a txt with gene lengths gene_length_path = os.path.join(SPARED_PATH, 'data', 'annotations', 'gene_length.txt') if not os.path.exists(gene_length_path): command = f'python {os.path.join(SPARED_PATH, "gtftools.py")} -l {gene_length_path} {gtf_path}' command_list = command.split(' ') subprocess.call(command_list) gene_length_path_mouse = os.path.join(SPARED_PATH, 'data', 'annotations', 'gene_length_mouse.txt') if not os.path.exists(gene_length_path_mouse): command = f'python {os.path.join(SPARED_PATH, "gtftools.py")} -l {gene_length_path_mouse} {gtf_path_mouse}' command_list = command.split(' ') subprocess.call(command_list) # Upload the gene lengths if organism.lower() == "mouse": glength_df = pd.read_csv(gene_length_path_mouse, delimiter='\t', usecols=['gene', 'merged']) elif organism.lower() == "human": glength_df = pd.read_csv(gene_length_path, delimiter='\t', usecols=['gene', 'merged']) else: assert "Organism not valid" # For the gene column, remove the version number glength_df['gene'] = glength_df['gene'].str.split('.').str[0] # Drop gene duplicates. NOTE: This only eliminates 40/60k genes so it is not a big deal glength_df = glength_df.drop_duplicates(subset=['gene']) # Find the genes that are in the gtf annotation file common_genes=list(set(adata.var_names)&set(glength_df["gene"])) # Subset both adata and glength_df to keep only the common genes adata = adata[:, common_genes].copy() glength_df = glength_df[glength_df["gene"].isin(common_genes)].copy() # Reindex the glength_df to genes glength_df = glength_df.set_index('gene') # Reindex glength_df to adata.var_names glength_df = glength_df.reindex(adata.var_names) # Assert indexes of adata.var and glength_df are the same assert (adata.var.index == glength_df.index).all() # Add gene lengths to adata.var adata.var['gene_length'] = glength_df['merged'].values # Divide each column of the counts matrix by the gene length. Save the result in layer "to_layer" adata.layers[to_layer] = adata.layers[from_layer] / adata.var['gene_length'].values.reshape(1, -1) # Make that each row sums to 1e6 adata.layers[to_layer] = np.nan_to_num(adata.layers[to_layer] / (np.sum(adata.layers[to_layer], axis=1).reshape(-1, 1)/1e6)) # Pass layer to np.array adata.layers[to_layer] = np.array(adata.layers[to_layer]) # Print the number of genes that were not found in the gtf annotation file failed_genes = initial_genes - adata.n_vars print(f'Number of genes not found in GTF file by TPM normalization: {initial_genes - adata.n_vars} out of {initial_genes} ({100*failed_genes/initial_genes:.2f}%) ({adata.n_vars} remaining)') # Return the transformed AnnData object return adata
[docs]def log1p_transformation(adata: ad.AnnData, from_layer: str, to_layer: str) -> ad.AnnData: """Perform :math:`\log_2(x+1)` transformation Performs logarithmic transformation over ``adata.layers[from_layer]``. Simple wrapper of scanpy's ``sc.pp.log1p()`` (base 2) to transform data from ``adata.layers[from_layer]`` and save it into ``adata.layers[to_layer]``. Args: adata (ad.AnnData): The AnnData object to transform. from_layer (str): The layer to take the data from. to_layer (str): The layer to store the results of the transformation. Returns: ad.AnnData: The updated AnnData object with transformed data in ``adata.layers[to_layer]``. """ # Transform the data with log1p transformed_adata = sc.pp.log1p(adata, base= 2.0, layer=from_layer, copy=True) # Add the log1p transformed data to adata.layers[to_layer] adata.layers[to_layer] = transformed_adata.layers[from_layer] # Return the transformed AnnData object return adata
# FIXME: Update to the new hosting of the pycombat package
[docs]def combat_transformation(adata: ad.AnnData, batch_key: str, from_layer: str, to_layer: str) -> ad.AnnData: """ Perform batch correction with ComBat Compute batch correction using the `pycombat <https://github.com/epigenelabs/pyComBat?tab=readme-ov-file>`_ package. The batches are defined by ``adata.obs[batch_key]`` so the user can define which variable to use as batch identifier. The input data for the batch correction is ``adata.layers[from_layer]`` and the output is stored in ``adata.layers[to_layer]``. Importantly, as the `original ComBat paper <https://doi.org/10.1093/biostatistics/kxj037>`_ notes the data should be approximately normally distributed. Therefore, it is recommended to use this function over :math:`\log_2(TPM+1)` data. Args: adata (ad.AnnData): The AnnData object to transform. Must have logarithmically transformed data in ``adata.layers[from_layer]``. batch_key (str): The column in ``adata.obs`` that defines the batches. from_layer (str): The layer to take the data from. to_layer (str): The layer to store the results of the transformation. Returns: ad.AnnData: The updated AnnData object with batch corrected data in ``adata.layers[to_layer]``. """ # Get expression matrix dataframe df = adata.to_df(layer = from_layer).T batch_list = adata.obs[batch_key].values.tolist() # Apply pycombat batch correction corrected_df = pycombat(df, batch_list, par_prior=True) # Assign batch corrected expression to .layers[to_layer] attribute adata.layers[to_layer] = corrected_df.T return adata
# TODO: Put reference to SEPAL as the first method trying this
[docs]def get_deltas(adata: ad.AnnData, from_layer: str, to_layer: str) -> ad.AnnData: """ Get expression deltas from the mean. Compute the deviations from the mean expression of each gene in ``adata.layers[from_layer]`` and save them in ``adata.layers[to_layer]``. Also add the mean expression of each gene to ``adata.var[f'{from_layer}_avg_exp']``. Average expression is computed using only train data determined by the ``adata.obs['split']`` column. However, deltas are computed for all observations. Args: adata (ad.AnnData): The AnnData object to update. Must have expression values in ``adata.layers[from_layer]``. Must also have the ``adata.obs['split']`` column with ``'train'`` values. from_layer (str): The layer to take the data from. to_layer (str): The layer to store the results of the transformation. Returns: ad.AnnData: The updated AnnData object with the deltas (``adata.layers[to_layer]``) and mean expression (``adata.var[f'{from_layer}_avg_exp']``) information. """ # Get the expression matrix of both train and global data glob_expression = adata.to_df(layer=from_layer) train_expression = adata[adata.obs['split'] == 'train'].to_df(layer=from_layer) # Define scaler scaler = StandardScaler(with_mean=True, with_std=False) # Fit the scaler to the train data scaler = scaler.fit(train_expression) # Get the centered expression matrix of the global data centered_expression = scaler.transform(glob_expression) # Add the deltas to adata.layers[to_layer] adata.layers[to_layer] = centered_expression # Add the mean expression to adata.var[f'{from_layer}_avg_exp'] adata.var[f'{from_layer}_avg_exp'] = scaler.mean_ # Return the updated AnnData object return adata
# TODO: When cheking this function on its own, try using a prediction layer "delta" # TODO: modify function to add noisy layer to important layers # FIXME: Double check the inside of the function. It looks weird
[docs]def add_noisy_layer(adata: ad.AnnData, prediction_layer: str) -> ad.AnnData: """ Add an artificial noisy layer. This function should only be used for experimentation/ablation purposes. The noisy layer is created by returning the missing values to an already denoised layer of the ``adata``. In the case the ``'prediction_layer'`` is on :math:`\log_2(TPM+1)` logarithmic scale, the noisy layer is created by assigning zero values to the missing values (adds ``'noisy'`` layer to the adata). In the case the ``'prediction_layer'`` is on delta scale, the noisy layer is created by assigning the negative mean of the gene to the missing values (adds ``'noisy_d'`` layer to the adata). Missing values are specified by the binary ``adata.layers['mask']`` layer that must be already present and has ``True`` values for all real data and ``False`` values for imputed data. Args: adata (ad.AnnData): The AnnData object to update. Must have the ``adata.layers[prediction_layer]``, the gene means if its a delta layer, and ``adata.layers['mask']``. prediction_layer (str): The layer that will be corrupted to create the noisy layer. Returns: ad.AnnData: The updated AnnData object with the ``adata.layers['noisy']`` or ``adata.layers['noisy_d']`` layer added depending on ``prediction_layer``. """ # FIXME: This condition looks weird if 'delta' in prediction_layer or 'noisy_d' in prediction_layer: # Get vector with gene means gene_means = adata.var[f"{prediction_layer}_avg_exp"].values # Expand gene means to the shape of the layer gene_means = np.repeat(gene_means.reshape(1, -1), adata.n_obs, axis=0) # Get valid mask valid_mask = adata.layers['mask'] # Initialize noisy deltas noisy_deltas = -gene_means delta_key = prediction_layer.split("log1p") # Assign delta values in positions where valid mask is true noisy_deltas[valid_mask] = adata.layers[f'{delta_key[0]}deltas'][valid_mask] # Add the layer to the adata adata.layers['noisy_d'] = noisy_deltas # Add a var column of used means of the layer mean_key = f'{prediction_layer}_avg_exp' adata.var['used_mean'] = adata.var[mean_key] else: # Copy the cleaned layer to the layer noisy noised_layer = adata.layers[prediction_layer].copy() # Get zero mask zero_mask = ~adata.layers['mask'] # Zero out the missing values noised_layer[zero_mask] = 0 # Add the layer to the adata adata.layers['noisy'] = noised_layer # Give warning to say that the noisy layer is being used print('Using noisy_delta layer for training. This will probably yield bad results.') return adata
# TODO: Put reference to the filter_dataset() function # FIXME: Here we should have the transformer cleaner too # TODO: Add combat link
[docs]def process_dataset(adata: ad.AnnData, param_dict: dict) -> ad.AnnData: """ Perform complete processing pipeline. This function performs the complete processing pipeline. It only computes over the expression and filters genes to get the final prediction variables. However, it doesn't perform spot (sample) filtering for which the ``filter_dataset()`` function is recommended. The input data ``adata.X`` is expected to be in raw counts. The processing pipeline is the following: 1. Normalize the data with TPM normalization (adds ``adata.layers['tpm']``) 2. Transform the data with logarithmically using :math:`\log_2(TPM+1)` (adds ``adata.layers['log1p']``) 3. Denoise the data with the adaptive median filter (adds ``adata.layers['d_log1p']``) 4. Compute Moran's I for each gene in each slide and average Moran's I across slides (adds ``adata.var['d_log1p_moran']``) 5. Filter dataset to keep the top ``param_dict['top_moran_genes']`` genes with highest Moran's I. 6. Perform ComBat batch correction if specified by the ``param_dict['combat_key']`` parameter (adds ``adata.layers['c_d_log1p']``) 7. Compute the deltas from the mean for each gene. Computed from ``log1p``, ``d_log1p`` and ``c_log1p``, ``c_d_log1p`` layer if batch correction was performed (adds ``deltas``, ``d_deltas``, ``c_deltas``, ``c_d_deltas`` layers) 8. Add a binary mask layer specifying valid observations for metric computation (adds ``adata.layers['mask']``, ``True`` for valid observations, ``False`` for missing values). Args: adata (ad.AnnData): The AnnData object to process. Should be already spot/sample filtered.. param_dict (dict): Dictionary that contains filtering and processing parameters. Keys that must be present are: - ``'top_moran_genes'`` (*int*): The number of genes to keep after filtering by Moran's I. If set to ``0``, then the number of genes is internally computed. - ``'combat_key'`` (*str*): The column in ``adata.obs`` that defines the batches for ComBat batch correction. If set to ``'None'``, then no batch correction is performed. - ``'hex_geometry'`` (*bool*): Whether the graph is hexagonal or not. If ``True``, then the graph is hexagonal. If ``False``, then the graph is a grid. Only ``True`` for Visium datasets. Returns: ad.Anndata: The processed AnnData object with all the layers and results added. A list of included keys in ``adata.layers`` is: - ``'counts'``: Raw counts of the dataset. - ``'tpm'``: TPM normalized data. - ``'log1p'``: :math:`\log_2(TPM+1)` transformed data. - ``'d_log1p'``: Denoised data with adaptive median filter. - ``'c_log1p'``: Batch corrected data with ComBat (only if ``param_dict['combat_key'] != 'None'``). - ``'c_d_log1p'``: Batch corrected and denoised data with adaptive median filter (only if ``param_dict['combat_key'] != 'None'``). - ``'deltas'``: Deltas from the mean expression for ``log1p``. - ``'d_deltas'``: Deltas from the mean expression for ``d_log1p``. - ``'c_deltas'``: Deltas from the mean expression for ``c_log1p`` (only if ``param_dict['combat_key'] != 'None'``). - ``'c_d_deltas'``: Deltas from the mean expression for ``c_d_log1p`` (only if ``param_dict['combat_key'] != 'None'``). - ``'mask'``: Binary mask layer. ``True`` for valid observations, ``False`` for imputed missing values. """ ### Compute all the processing steps # NOTE: The d prefix stands for denoised # NOTE: The c prefix stands for combat corrected # Start the timer and print the start message start = time() print('Starting data processing...') # First add raw counts to adata.layers['counts'] adata.layers['counts'] = adata.X.toarray() # 1. Make TPM normalization adata = tpm_normalization(param_dict["organism"], adata, from_layer='counts', to_layer='tpm') # 2. Transform the data with log1p (base 2.0) adata = log1p_transformation(adata, from_layer='tpm', to_layer='log1p') # 3. Denoise the data with adaptive median filter adata = denoising.median_cleaner(adata, from_layer='log1p', to_layer='d_log1p', n_hops=4, hex_geometry=param_dict["hex_geometry"]) # 4. Compute average moran for each gene in the layer d_log1p adata = gene_features.compute_moran(adata, hex_geometry=param_dict["hex_geometry"], from_layer='d_log1p') # 5. Filter genes by Moran's I adata = filtering.filter_by_moran(adata, n_keep=param_dict['top_moran_genes'], from_layer='d_log1p') # 6. If combat key is specified, apply batch correction if param_dict['combat_key'] != 'None': adata = combat_transformation(adata, batch_key=param_dict['combat_key'], from_layer='log1p', to_layer='c_log1p') adata = combat_transformation(adata, batch_key=param_dict['combat_key'], from_layer='d_log1p', to_layer='c_d_log1p') # 7. Compute deltas and mean expression for all log1p, d_log1p, c_log1p and c_d_log1p adata = get_deltas(adata, from_layer='log1p', to_layer='deltas') adata = get_deltas(adata, from_layer='d_log1p', to_layer='d_deltas') if param_dict['combat_key'] != 'None': adata = get_deltas(adata, from_layer='c_log1p', to_layer='c_deltas') adata = get_deltas(adata, from_layer='c_d_log1p', to_layer='c_d_deltas') # 8. Add a binary mask layer specifying valid observations for metric computation adata.layers['mask'] = adata.layers['tpm'] != 0 # TODO: add spackle imputation and deltas # Print with the percentage of the dataset that was replaced print('Percentage of imputed observations with median filter: {:5.3f}%'.format(100 * (~adata.layers["mask"]).sum() / (adata.n_vars*adata.n_obs))) # Print the number of cells and genes in the dataset print(f'Processing of the data took {time() - start:.2f} seconds') print(f'The processed dataset looks like this:') print(adata) return adata