Source code for CPAC.cwas.cwas

import os

import nibabel as nb
import numpy as np
import pandas as pd
import scipy.stats
from scipy.stats import t
from numpy import inf

from CPAC.cwas.mdmr import mdmr
from CPAC.utils import correlation

from CPAC.pipeline.cpac_ga_model_generator import (create_merge_mask,
                                                   create_merged_copefile)


[docs]def joint_mask(subjects, mask_file=None): """ Creates a joint mask (intersection) common to all the subjects in a provided list and a provided mask Parameters ---------- subjects : dict of strings A length `N` list of file paths of the nifti files of subjects mask_file : string Path to a mask file in nifti format Returns ------- joint_mask : string Path to joint mask file in nifti format """ if not mask_file: files = list(subjects.values()) cope_file = os.path.join(os.getcwd(), 'joint_cope.nii.gz') mask_file = os.path.join(os.getcwd(), 'joint_mask.nii.gz') create_merged_copefile(files, cope_file) create_merge_mask(cope_file, mask_file) return mask_file
def calc_mdmrs(D, regressor, cols, permutations): cols = np.array(cols, dtype=np.int32) F_set, p_set = mdmr(D, regressor, cols, permutations) return F_set, p_set def calc_subdists(subjects_data, voxel_range): subjects, voxels, _ = subjects_data.shape D = np.zeros((len(voxel_range), subjects, subjects)) for i, v in enumerate(voxel_range): profiles = np.zeros((subjects, voxels)) for si in range(subjects): profiles[si] = correlation(subjects_data[si, v], subjects_data[si]) profiles = np.clip(np.nan_to_num(profiles), -0.9999, 0.9999) profiles = np.arctanh(np.delete(profiles, v, 1)) D[i] = correlation(profiles, profiles) D = np.sqrt(2.0 * (1.0 - D)) return D def calc_cwas(subjects_data, regressor, regressor_selected_cols, permutations, voxel_range): D = calc_subdists(subjects_data, voxel_range) F_set, p_set = calc_mdmrs( D, regressor, regressor_selected_cols, permutations) return F_set, p_set def pval_to_zval(p_set, permu): inv_pval = 1 - p_set zvals = t.ppf(inv_pval, (len(p_set) - 1)) zvals[zvals == -inf] = permu / (permu + 1) zvals[zvals == inf] = permu / (permu + 1) return zvals
[docs]def nifti_cwas(subjects, mask_file, regressor_file, participant_column, columns_string, permutations, voxel_range): """ Performs CWAS for a group of subjects Parameters ---------- subjects : dict of strings:strings A length `N` dict of id and file paths of the nifti files of subjects mask_file : string Path to a mask file in nifti format regressor_file : string file path to regressor CSV or TSV file (phenotypic info) columns_string : string comma-separated string of regressor labels permutations : integer Number of pseudo f values to sample using a random permutation test voxel_range : ndarray Indexes from range of voxels (inside the mask) to perform cwas on. Index ordering is based on the np.where(mask) command Returns ------- F_file : string .npy file of pseudo-F statistic calculated for every voxel p_file : string .npy file of significance probabilities of pseudo-F values voxel_range : tuple Passed on by the voxel_range provided in parameters, used to make parallelization easier """ try: regressor_data = pd.read_table(regressor_file, sep=None, engine="python", dtype={ participant_column: str }) except: regressor_data = pd.read_table(regressor_file, sep=None, engine="python") regressor_data = regressor_data.astype({ participant_column: str }) # drop duplicates regressor_data = regressor_data.drop_duplicates() regressor_cols = list(regressor_data.columns) if not participant_column in regressor_cols: raise ValueError('Participant column was not found in regressor file.') if participant_column in columns_string: raise ValueError('Participant column can not be a regressor.') subject_ids = list(subjects.keys()) subject_files = list(subjects.values()) # check for inconsistency with leading zeroes # (sometimes, the sub_ids from individual will be something like # '0002601' and the phenotype will have '2601') for index, row in regressor_data.iterrows(): pheno_sub_id = str(row[participant_column]) for sub_id in subject_ids: if str(sub_id).lstrip('0') == str(pheno_sub_id): regressor_data.at[index, participant_column] = str(sub_id) regressor_data.index = regressor_data[participant_column] # Keep only data from specific subjects ordered_regressor_data = regressor_data.loc[subject_ids] columns = columns_string.split(',') regressor_selected_cols = [ i for i, c in enumerate(regressor_cols) if c in columns ] if len(regressor_selected_cols) == 0: regressor_selected_cols = [i for i, c in enumerate(regressor_cols)] regressor_selected_cols = np.array(regressor_selected_cols) # Remove participant id column from the dataframe and convert it to a numpy matrix regressor = ordered_regressor_data \ .drop(columns=[participant_column]) \ .reset_index(drop=True) \ .values \ .astype(np.float64) if len(regressor.shape) == 1: regressor = regressor[:, np.newaxis] elif len(regressor.shape) != 2: raise ValueError('Bad regressor shape: %s' % str(regressor.shape)) if len(subject_files) != regressor.shape[0]: raise ValueError('Number of subjects does not match regressor size') mask = nb.load(mask_file).get_fdata().astype('bool') mask_indices = np.where(mask) subjects_data = np.array([ nb.load(subject_file).get_fdata().astype('float64')[mask_indices] for subject_file in subject_files ]) F_set, p_set = calc_cwas(subjects_data, regressor, regressor_selected_cols, permutations, voxel_range) cwd = os.getcwd() F_file = os.path.join(cwd, 'pseudo_F.npy') p_file = os.path.join(cwd, 'significance_p.npy') np.save(F_file, F_set) np.save(p_file, p_set) return F_file, p_file, voxel_range
def create_cwas_batches(mask_file, batches): mask = nb.load(mask_file).get_fdata().astype('bool') voxels = mask.sum(dtype=int) return np.array_split(np.arange(voxels), batches) def volumize(mask_image, data): mask_data = mask_image.get_fdata().astype('bool') volume = np.zeros_like(mask_data, dtype=data.dtype) volume[np.where(mask_data == True)] = data return nb.Nifti1Image( volume, header=mask_image.header, affine=mask_image.affine ) def merge_cwas_batches(cwas_batches, mask_file, z_score, permutations): _, _, voxel_range = zip(*cwas_batches) voxels = np.array(np.concatenate(voxel_range)) mask_image = nb.load(mask_file) F_set = np.zeros_like(voxels, dtype=np.float64) p_set = np.zeros_like(voxels, dtype=np.float64) for F_file, p_file, voxel_range in cwas_batches: F_set[voxel_range] = np.load(F_file) p_set[voxel_range] = np.load(p_file) log_p_set = -np.log10(p_set) one_p_set = 1 - p_set F_vol = volumize(mask_image, F_set) p_vol = volumize(mask_image, p_set) log_p_vol = volumize(mask_image, log_p_set) one_p_vol = volumize(mask_image, one_p_set) cwd = os.getcwd() F_file = os.path.join(cwd, 'pseudo_F_volume.nii.gz') p_file = os.path.join(cwd, 'p_significance_volume.nii.gz') log_p_file = os.path.join(cwd, 'neglog_p_significance_volume.nii.gz') one_p_file = os.path.join(cwd, 'one_minus_p_values.nii.gz') F_vol.to_filename(F_file) p_vol.to_filename(p_file) log_p_vol.to_filename(log_p_file) one_p_vol.to_filename(one_p_file) if 1 in z_score: zvals = pval_to_zval(p_set, permutations) z_file = zstat_image(zvals, mask_file) else: z_file = None return F_file, p_file, log_p_file, one_p_file, z_file def zstat_image(zvals, mask_file): mask_image = nb.load(mask_file) z_vol = volumize(mask_image, zvals) cwd = os.getcwd() z_file = os.path.join(cwd, 'zstat.nii.gz') z_vol.to_filename(z_file) return z_file