# TODO: create a function that can help easily map raw pheno files that do not
# TODO: have the participant_session id that CPAC uses
def read_group_list_text_file(group_list_text_file):
"""Read in the group-level analysis participant-session list text file."""
with open(group_list_text_file, "r") as f:
group_list = f.readlines()
# each item here includes both participant and session, and this also will
# become the main ID column in the written design matrix CSV
group_list = [str(x).rstrip("\n") for x in group_list if x != ""]
return group_list
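# Illustrative usage (a hedged sketch, not part of the original module; the
# file path and IDs below are hypothetical). The text file is expected to
# hold one participant_session ID per line:
#
#     group_participant_list.txt:
#         sub-0001_ses-1
#         sub-0002_ses-1
#
#     >>> read_group_list_text_file("/analysis/group_participant_list.txt")
#     ['sub-0001_ses-1', 'sub-0002_ses-1']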
def read_pheno_csv_into_df(pheno_csv, id_label=None):
"""Read the phenotypic file CSV or TSV into a Pandas DataFrame."""
import pandas as pd
with open(pheno_csv, "r") as f:
if id_label:
if '.tsv' in pheno_csv or '.TSV' in pheno_csv:
pheno_df = pd.read_table(f, dtype={id_label: object})
else:
pheno_df = pd.read_csv(f, dtype={id_label: object})
else:
if '.tsv' in pheno_csv or '.TSV' in pheno_csv:
pheno_df = pd.read_table(f)
else:
pheno_df = pd.read_csv(f)
return pheno_df
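# Illustrative usage (hedged sketch; the path, column names, and values below
# are hypothetical). Passing id_label reads that column as strings, so any
# leading zeroes in participant IDs are preserved:
#
#     pheno.csv:
#         participant,age,diagnosis
#         0002601,24,typical
#         0002602,31,patient
#
#     >>> df = read_pheno_csv_into_df("/analysis/pheno.csv",
#     ...                             id_label="participant")
#     >>> list(df["participant"])
#     ['0002601', '0002602']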
def write_group_list_text_file(group_list, out_file=None):
"""Write out the group-level analysis participant list as a text file."""
import os
# prevent duplicates - depending on how the design matrix is set up, we
# might have multiples of the sub_ses_ID's, like if we're doing repeated
# measures with series/scans
new_group_list = []
for sub_ses_id in group_list:
if sub_ses_id not in new_group_list:
new_group_list.append(sub_ses_id)
if not out_file:
out_file = os.path.join(os.getcwd(), "group_analysis_participant_"
"list.txt")
else:
out_file = os.path.abspath(out_file)
dir_path = out_file.split(os.path.basename(out_file))[0]
if not os.path.exists(dir_path):
os.makedirs(dir_path)
with open(out_file, "wt") as f:
for part_id in new_group_list:
f.write("{0}\n".format(part_id))
if os.path.exists(out_file):
print("Group-level analysis participant list written:" \
"\n{0}\n".format(out_file))
return out_file
def write_dataframe_to_csv(matrix_df, out_file=None):
"""Write out a matrix Pandas DataFrame into a CSV file."""
import os
if not out_file:
out_file = os.path.join(os.getcwd(), "matrix.csv")
else:
out_file = os.path.abspath(out_file)
dir_path = out_file.split(os.path.basename(out_file))[0]
if not os.path.exists(dir_path):
os.makedirs(dir_path)
try:
matrix_df = matrix_df.drop(labels='participant_session_id', axis=1)
except (ValueError, KeyError):
pass
matrix_df.to_csv(out_file, index=False)
if os.path.exists(out_file):
print("CSV file written:\n{0}\n".format(out_file))
def write_config_dct_to_yaml(config_dct, out_file=None):
"""Write out a configuration dictionary into a YAML file."""
import os
import CPAC
if not out_file:
out_file = os.path.join(os.getcwd(), "group_config.yml")
else:
out_file = os.path.abspath(out_file)
dir_path = out_file.split(os.path.basename(out_file))[0]
if not os.path.exists(dir_path):
os.makedirs(dir_path)
if not out_file.endswith(".yml"):
out_file = "{0}.yml".format(out_file)
field_order = ['pipeline_dir', 'participant_list', 'output_dir', 'work_dir',
'log_dir', 'FSLDIR', 'run_fsl_feat', 'num_models_at_once',
'model_name', 'preset', 'pheno_file', 'ev_selections',
'participant_id_label', 'design_formula', 'mean_mask',
'custom_roi_mask', 'derivative_list', 'coding_scheme',
'group_sep', 'grouping_var', 'z_threshold', 'p_threshold',
'sessions_list', 'series_list', 'contrasts', 'f_tests',
'custom_contrasts', 'run_randomise', 'randomise_permutation',
'randomise_thresh', 'randomise_demean', 'randomise_tfce']
with open(out_file, "wt") as f:
f.write("# CPAC Group-Level Analysis Configuration File\n"
"# Version {0}\n".format(CPAC.__version__))
f.write("#\n# http://fcp-indi.github.io for more info.\n#\n"
"# Tip: This file can be edited manually with "
"a text editor for quick modifications.\n\n\n")
f.write("# General Group-Level Analysis Settings\n"
"##############################################################"
"################\n\n")
for key in field_order:
val = config_dct[key]
f.write("{0}: {1}\n\n".format(key, val))
if key == 'FSLDIR':
f.write("\n# FSL-FEAT\n########################################"
"######################################\n\n")
if key == 'custom_contrasts':
f.write("\n# FSL-Randomise\n###################################"
"###########################################\n\n")
if os.path.exists(out_file):
print("Group-level analysis configuration YAML file written:\n" \
"{0}\n".format(out_file))
def create_design_matrix_df(group_list, pheno_df=None,
ev_selections=None, pheno_sub_label=None,
pheno_ses_label=None, pheno_site_label=None,
ses_id=False):
"""Create the design matrix intended for group-level analysis via the FSL
FLAME tool.
This does NOT create the final .mat file that FSL FLAME takes in. This is
an intermediary design matrix CSV meant for the user to review.
If there is a phenotype CSV provided, this function will align the
participant-session ID labels in the CPAC individual-level analysis output
directory with the values listed in the phenotype file.
"""
import pandas as pd
keep_cols = ['participant_id']
if ses_id:
# if the group_list is participant_session_id instead of participant_id
map_df = pd.DataFrame({'participant_session_id': group_list})
keep_cols += ['participant_session_id']
part_ids = []
sess_ids = []
for part_ses in group_list:
part = part_ses.split('_')[0]
sess = part_ses.split('_')[1]
part_ids.append(part)
sess_ids.append(sess)
map_df['participant_id'] = part_ids
map_df['session'] = sess_ids
map_df = map_df.sort_values(by=['session', 'participant_id'])
else:
map_df = pd.DataFrame({'participant_id': group_list})
if pheno_df is None:
# no phenotypic matrix provided; simpler design models
design_df = map_df[keep_cols]
else:
# if a phenotype CSV file is provided with the data
pheno_df = pheno_df.drop_duplicates()
# replace spaces and dashes with underscores, to prevent confusion with
# the Patsy design formula
rename_pheno_cols = {}
for col_name in pheno_df.columns:
if ' ' in col_name or '-' in col_name:
rename_pheno_cols.update({col_name: col_name.replace(' ', '_').replace('-', '_')})
pheno_df = pheno_df.rename(columns=rename_pheno_cols)
# align the pheno's participant ID column with the group sublist text
# file
if not pheno_sub_label:
# TODO: exception message
raise Exception("there's a pheno file, but no pheno sub label")
else:
# rename the phenotype participant ID column to the standard label
pheno_df = pheno_df.rename(
columns={pheno_sub_label: 'participant_id'})
if ev_selections:
ev_selections.insert(0, 'participant_id')
sort_by = ['participant_id']
if pheno_ses_label:
# if sessions are important in the model, do this also
pheno_df = pheno_df.rename(
columns={pheno_ses_label: 'session_id'})
if ev_selections:
ev_selections.insert(1, 'session_id')
# again, sort by session ID first in case of repeated
# measures, where the sessions have to be all together first
sort_by.insert(0, 'session_id')
if pheno_site_label:
# and if sites are important as well, same here
pheno_df = pheno_df.rename(
columns={pheno_site_label: 'site_id'})
if ev_selections:
ev_selections.insert(2, 'site_id')
if ev_selections:
# get specific covariates!
pheno_df = pheno_df[ev_selections]
# check for inconsistency with leading zeroes
# (sometimes, the sub_ids from individual will be something like
# '0002601' and the phenotype will have '2601')
sublist_subs = map_df['participant_id']
for index, row in pheno_df.iterrows():
pheno_sub_id = str(row['participant_id'])
for sub_id in sublist_subs:
if str(sub_id).lstrip('0') == pheno_sub_id:
pheno_df.at[index, 'participant_id'] = sub_id
# grab the phenotype IDs only after the adjustment above, so that the
# match check below sees the updated values
pheno_subs = list(pheno_df['participant_id'])
for sub in sublist_subs:
if sub in pheno_subs:
# okay, there's at least one match
break
else:
new_sublist_subs = [str(x).lstrip('0') for x in sublist_subs]
for sub in new_sublist_subs:
if sub in pheno_subs:
# that's better
map_df['participant_id'] = new_sublist_subs
break
else:
raise Exception('the participant IDs in your group '
'analysis participant list and the '
'participant IDs in your phenotype file '
'do not match')
# merge
if pheno_ses_label:
design_df = pheno_df.merge(map_df, on=['participant_id'])
else:
design_df = pheno_df.merge(map_df[['participant_id']],
on='participant_id')
design_df = design_df.sort_values(sort_by)
return design_df
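# Illustrative example of the ID alignment above (hedged sketch; the IDs and
# the "subid" column name are hypothetical). When the CPAC output sublist
# uses zero-padded IDs and the phenotype file does not, the padded form is
# adopted so that the merge succeeds:
#
#     >>> import pandas as pd
#     >>> pheno = pd.DataFrame({"subid": ["2601", "2602"],
#     ...                       "age": [24, 31]})
#     >>> design = create_design_matrix_df(["0002601", "0002602"], pheno,
#     ...                                  pheno_sub_label="subid")
#     >>> list(design["participant_id"])
#     ['0002601', '0002602']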
def create_contrasts_template_df(design_df, contrasts_dct_list=None):
"""Create the template Pandas DataFrame for the contrasts matrix CSV.
The headers in the contrasts matrix need to match the headers of the
design matrix."""
import pandas as pd
contrast_cols = list(design_df.columns)
contrast_cols.remove('participant_id')
if contrasts_dct_list:
# if we are initializing the contrasts matrix with pre-set contrast
# vectors - just check for accuracy here
for contrast_dct in contrasts_dct_list:
# contrast_dct is a dictionary with each column name mapped to its
# contrast vector value, plus a "Contrasts" label, like this:
# {"Contrasts": "Group Mean", "Group Mean": 1, "age": 0}
if (len(contrast_dct) - 1) != len(contrast_cols):
# it's -1 because of the "Contrasts" column in contrast_dct
# TODO: message
raise Exception("number of columns in the contrast vector "
"does not match the number of covariate "
"columns in the design matrix")
else:
# if default, start it up with a blank "template" contrast vector
contrast_one = {"Contrasts": "contrast_1"}
contrast_two = {"Contrasts": "contrast_2"}
for col in contrast_cols:
contrast_one.update({col: 0})
contrast_two.update({col: 0})
contrasts_dct_list = [contrast_one, contrast_two]
contrast_cols.insert(0, "Contrasts")
# now, make the actual dataframe
contrasts_df = pd.DataFrame(contrasts_dct_list)
# order the columns properly
contrasts_df = contrasts_df[contrast_cols]
return contrasts_df
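# Illustrative default output (hedged sketch; the design_df below is
# hypothetical). With no contrasts_dct_list, two all-zero template rows are
# generated for the user to fill in:
#
#     >>> import pandas as pd
#     >>> design = pd.DataFrame({"participant_id": ["sub-01", "sub-02"],
#     ...                        "age": [24, 31], "Group_Mean": [1, 1]})
#     >>> create_contrasts_template_df(design).to_dict("records")
#     [{'Contrasts': 'contrast_1', 'age': 0, 'Group_Mean': 0},
#      {'Contrasts': 'contrast_2', 'age': 0, 'Group_Mean': 0}]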
def preset_single_group_avg(group_list, pheno_df=None, covariate=None,
pheno_sub_label=None, output_dir=None,
model_name="one_sample_T-test"):
"""Set up the design matrix CSV for running a single group average
(one-sample T-test)."""
import os
if not output_dir:
output_dir = os.getcwd()
id_cols = ["participant_id", "session_id", "site_id"]
# change spaces and dashes to underscores to prevent confusion with the
# Patsy design formula
if covariate:
covariate = covariate.lstrip(' ').rstrip(' ')
covariate = covariate.replace(' ', '_').replace('-', '_')
ev_selections = None
if pheno_df is not None:
if covariate and pheno_sub_label:
# if we're adding an additional covariate
ev_selections = [covariate]
design_df = create_design_matrix_df(group_list, pheno_df,
ev_selections=ev_selections,
pheno_sub_label=pheno_sub_label)
design_df["Group_Mean"] = 1
group_mean_contrast = {"Contrasts": "Group Mean"}
# make these loops in case we expand this to handle more than one
# covariate past the Group Mean
for col in design_df.columns:
if col not in id_cols:
if col == "Group_Mean":
group_mean_contrast.update({col: 1})
else:
group_mean_contrast.update({col: 0})
contrasts = [group_mean_contrast]
if covariate:
covariate_contrast = {"Contrasts": covariate}
for col in design_df.columns:
if col not in id_cols:
if col == covariate:
covariate_contrast.update({col: 1})
else:
covariate_contrast.update({col: 0})
contrasts.append(covariate_contrast)
contrasts_df = create_contrasts_template_df(design_df, contrasts)
# create design and contrasts matrix file paths
design_mat_path = os.path.join(output_dir, model_name,
"design_matrix_{0}.csv".format(model_name))
contrasts_mat_path = os.path.join(output_dir, model_name,
"contrasts_matrix_{0}.csv"
"".format(model_name))
# start group config yaml dictionary
design_formula = "Group_Mean"
if covariate:
design_formula = "{0} + {1}".format(design_formula, covariate)
group_config = {"pheno_file": design_mat_path,
"ev_selections": {"demean": [str(covariate)],
"categorical": ["Group_Mean"]},
"design_formula": design_formula,
"group_sep": "Off",
"grouping_var": None,
"sessions_list": [],
"series_list": [],
"custom_contrasts": contrasts_mat_path,
"model_name": model_name,
"output_dir": os.path.join(output_dir, model_name),
"work_dir": os.path.join(output_dir, model_name),
"log_dir": os.path.join(output_dir, model_name)}
return design_df, contrasts_df, group_config
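# Illustrative result (hedged sketch; the IDs, phenotype values, and "age"
# covariate are hypothetical). With a phenotype and one covariate, the
# design gains a constant Group_Mean column plus the covariate, and two
# contrasts are generated:
#
#     >>> import pandas as pd
#     >>> pheno = pd.DataFrame({"subid": ["sub-01", "sub-02"],
#     ...                       "age": [24, 31]})
#     >>> d, c, cfg = preset_single_group_avg(["sub-01", "sub-02"], pheno,
#     ...                                     covariate="age",
#     ...                                     pheno_sub_label="subid")
#     >>> list(d.columns)
#     ['participant_id', 'age', 'Group_Mean']
#     >>> list(c["Contrasts"])
#     ['Group Mean', 'age']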
def preset_unpaired_two_group(group_list, pheno_df, groups, pheno_sub_label,
output_dir=None,
model_name="two_sample_unpaired_T-test"):
"""Set up the design matrix and contrasts matrix for running an unpaired
two-group difference (two-sample unpaired T-test).
group_list: a list of strings- sub_ses unique IDs
pheno_df: a Pandas DataFrame object of the phenotypic file CSV/matrix
groups: a list of either one or two strings- design matrix EV/covariate
labels to take from the phenotype DF and include in the model
pheno_sub_label: a string of the label name of the column in the phenotype
file that holds the participant/session ID for each row
output_dir: (optional) string of the output directory path
model_name: (optional) name/label of the model to run
Sets up the model described here:
https://fsl.fmrib.ox.ac.uk/fsl/fslwiki/FEAT/UserGuide
#Unpaired_Two-Group_Difference_.28Two-Sample_Unpaired_T-Test.29
Only one "group" will be provided usually if the two groups in the
phenotypic information you wish to compare are encoded in one covariate
column, as categorical information. Thus, providing this one name will
pull it from the phenotype file, and this function will break it out into
two columns using dummy-coding.
"""
import os
if not output_dir:
output_dir = os.getcwd()
id_cols = ["participant_id", "session_id", "site_id"]
# change spaces and dashes to underscores to prevent confusion with the
# Patsy design formula
old_groups = groups
groups = []
for group in old_groups:
group = group.lstrip(' ').rstrip(' ')
group = group.replace(' ', '_').replace('-', '_')
groups.append(group)
# if the two groups are encoded in one categorical EV/column, then we will
# have to dummy code them out
# if this is the case, then "groups" will be a list with only one
# element in it- the one EV/column that is to be split up into two
ev_selections = []
for group in groups:
ev_selections.append(group)
design_df = create_design_matrix_df(group_list, pheno_df,
ev_selections=ev_selections,
pheno_sub_label=pheno_sub_label)
if len(groups) == 1:
# we're going to split the one categorical EV into two
new_groups = []
# get full range of values in one-column categorical EV
group_set = list(set(design_df[groups[0]]))
# run this again!
# change spaces and dashes to underscores to prevent confusion with the
# Patsy design formula
new_group_set = []
for group in group_set:
group = group.lstrip(' ').rstrip(' ')
group = group.replace(' ', '_').replace('-', '_')
new_group_set.append(group)
# this preset is for an unpaired two-group difference- should only be
# two groups encoded in this EV!
if len(group_set) > 2:
# TODO: message
raise Exception("more than two groups provided, but this is a"
"model for a two-group difference\n\ngroups "
"found in column:\n{0}".format(str(group_set)))
elif len(group_set) == 0:
raise Exception("no groups were found - something went wrong "
"with reading the phenotype information")
elif len(group_set) == 1:
raise Exception("only one group found in the column provided, "
"but this is a model for a two-group difference"
"\n\ngroups found in column:\n"
"{0}".format(str(group_set)))
# create the two new dummy-coded columns
# column 1
# new column name
new_name = "{0}_{1}".format(groups[0], new_group_set[0])
# create new column encoded in 0's
design_df[new_name] = 0
# map the relevant values into 1's
design_df[new_name] = design_df[groups[0]].map({group_set[0]: 1,
group_set[1]: 0})
# update groups list
new_groups.append(new_name)
# column 2
# new column name
new_name = "{0}_{1}".format(groups[0], new_group_set[1])
# create new column encoded in 0's
design_df[new_name] = 0
# map the relevant values into 1's
design_df[new_name] = design_df[groups[0]].map({group_set[1]: 1,
group_set[0]: 0})
# update groups list
new_groups.append(new_name)
# drop original EV/column
del design_df[groups[0]]
# update groups list
groups = new_groups
# start the contrasts
contrast_one = {"Contrasts": "{0} - {1}".format(groups[0], groups[1])}
contrast_two = {"Contrasts": "{0} - {1}".format(groups[1], groups[0])}
# make these loops in case we expand this to handle additional covariates
# past the "prescribed" ones in the model/preset
for col in design_df.columns:
if col not in id_cols:
if col == groups[0]:
contrast_one.update({col: 1})
contrast_two.update({col: -1})
elif col == groups[1]:
contrast_one.update({col: -1})
contrast_two.update({col: 1})
else:
contrast_one.update({col: 0})
contrast_two.update({col: 0})
contrasts = [contrast_one, contrast_two]
contrasts_df = create_contrasts_template_df(design_df, contrasts)
# create design and contrasts matrix file paths
design_mat_path = os.path.join(output_dir, model_name,
"design_matrix_{0}.csv".format(model_name))
contrasts_mat_path = os.path.join(output_dir, model_name,
"contrasts_matrix_{0}.csv"
"".format(model_name))
# start group config yaml dictionary
design_formula = "{0} + {1}".format(groups[0], groups[1])
group_config = {"pheno_file": design_mat_path,
"ev_selections": {"demean": [],
"categorical": str(groups)},
"design_formula": design_formula,
"group_sep": "On",
"grouping_var": str(groups),
"sessions_list": [],
"series_list": [],
"custom_contrasts": contrasts_mat_path,
"model_name": model_name,
"output_dir": os.path.join(output_dir, model_name),
"work_dir": os.path.join(output_dir, model_name),
"log_dir": os.path.join(output_dir, model_name)}
return design_df, contrasts_df, group_config
def preset_paired_two_group(group_list, conditions, condition_type="session",
output_dir=None,
model_name="two_sample_paired_T-test"):
"""Set up the design matrix and contrasts matrix for running a paired
two-group difference (two-sample paired T-test).
group_list: a list of strings- sub_ses unique IDs
conditions: a two-item list of strings- session or series/scan names of
the two sessions or two scans (per participant) you wish to
compare
condition_type: a string, either "session" or "scan", depending on what
is in "conditions"
output_dir: (optional) string of the output directory path
model_name: (optional) name/label of the model to run
Sets up the model described here:
https://fsl.fmrib.ox.ac.uk/fsl/fslwiki/FEAT/UserGuide
#Paired_Two-Group_Difference_.28Two-Sample_Paired_T-Test.29
"""
import os
if not output_dir:
output_dir = os.getcwd()
if len(conditions) != 2:
raise Exception('Two conditions (sessions or scans) are required for '
'the paired t-test.\n')
sess_conditions = ["session", "Session", "sessions", "Sessions"]
scan_conditions = ["scan", "scans", "series", "Series/Scans", "Series"]
sesflag = False
if condition_type in sess_conditions:
sesflag = True
design_df = create_design_matrix_df(group_list, ses_id=sesflag)
# make the "condition" EV (the 1's and -1's delineating the two
# conditions, with the "conditions" being the two sessions or two scans)
condition_ev = []
if condition_type in sess_conditions:
# note: the participant_id column in design_df should be in order, so
# the condition_ev should come out in order:
# 1,1,1,1,2,2,2,2 (if the sessions are 1 and 2)
# later on, this has to be converted to 1,1,1,1,-1,-1,-1,-1 !
for sub_ses_id in design_df["participant_session_id"]:
if sub_ses_id.split("_")[-1] == conditions[0]:
condition_ev.append(conditions[0])
elif sub_ses_id.split("_")[-1] == conditions[1]:
condition_ev.append(conditions[1])
group_config = {"sessions_list": conditions, "series_list": []}
elif condition_type in scan_conditions:
# TODO: re-visit later, when session/scan difference in how to run
# TODO: group-level analysis repeated measures is streamlined and
# TODO: simplified
# the information needed in this part is not encoded in the group
# sublist! user inputs the two scan names, and we have a list of
# sub_ses (which needs to be doubled), with each scan paired to each
# half of this list (will need to ensure these scans exist for each
# selected derivative in the output directory later on)
# the condition_ev should come out in order:
# scan-1,scan-1,scan-1,scan-2,scan-2,scan-2
# (if the scans are scan-1 and scan-2, etc.)
# later on, this has to be converted to 1,1,1,-1,-1,-1 !
for sub_ses_id in design_df["participant_id"]:
condition_ev.append(conditions[0])
for sub_ses_id in design_df["participant_id"]:
condition_ev.append(conditions[1])
# NOTE: there is only one iteration of the sub_ses list in
# design_df["participant_id"] at this point! so use append to
# double that column:
design_df = design_df.append(design_df)
group_config = {"sessions_list": [], "series_list": conditions}
else:
raise Exception('Invalid condition type: {0}\nMust be either "session" '
'or "scan".\n'.format(condition_type))
# let's check to make sure it came out right
# first half
past_val = None
for val in condition_ev[0:(len(condition_ev) // 2) - 1]:
if past_val:
if val != past_val:
raise Exception('Non-equal amount of participants for each '
'{0}.\n'.format(condition_type))
past_val = val
# second half
past_val = None
for val in condition_ev[(len(condition_ev) // 2):]:
if past_val:
if val != past_val:
raise Exception('Non-equal amount of participants for each '
'{0}.\n'.format(condition_type))
past_val = val
design_df[condition_type] = condition_ev
# initialize the contrast dct's
contrast_one = {}
contrast_two = {}
design_formula = "{0}".format(condition_type)
# create the participant identity columns
for sub_ses_id in design_df["participant_id"]:
new_part_col = []
sub_id = sub_ses_id.split("_")[0]
new_part_label = "participant_{0}".format(sub_id)
for moving_sub_ses_id in design_df["participant_id"]:
moving_sub_id = moving_sub_ses_id.split("_")[0]
if moving_sub_id == sub_id:
new_part_col.append(1)
else:
new_part_col.append(0)
design_df[new_part_label] = new_part_col
contrast_one.update({new_part_label: 0})
contrast_two.update({new_part_label: 0})
if new_part_label not in design_formula:
design_formula = "{0} + {1}".format(design_formula,
new_part_label)
# finish the contrasts
# should be something like
# ses,sub,sub,sub, etc.
# ses-1 - ses-2: 1, 0, 0, 0, 0...
# ses-2 - ses-1: -1, 0, 0, 0, etc.
contrast_one.update({
"Contrasts": "{0}_{1} - {2}_{3}".format(condition_type,
conditions[0],
condition_type,
conditions[1])})
contrast_two.update({
"Contrasts": "{0}_{1} - {2}_{3}".format(condition_type,
conditions[1],
condition_type,
conditions[0])})
contrast_one.update({condition_type: 1})
contrast_two.update({condition_type: -1})
try:
design_df = design_df.drop(labels=['participant_session_id'],
axis='columns')
except KeyError:
pass
contrasts = [contrast_one, contrast_two]
contrasts_df = create_contrasts_template_df(design_df, contrasts)
# create design and contrasts matrix file paths
design_mat_path = os.path.join(output_dir, model_name,
"design_matrix_{0}.csv".format(model_name))
contrasts_mat_path = os.path.join(output_dir, model_name,
"contrasts_matrix_{0}.csv"
"".format(model_name))
# start group config yaml dictionary
group_config.update({"pheno_file": design_mat_path,
"ev_selections": {"demean": [],
"categorical": []},
"design_formula": design_formula,
"group_sep": "Off",
"grouping_var": None,
"custom_contrasts": contrasts_mat_path,
"model_name": model_name,
"output_dir": os.path.join(output_dir, model_name),
"work_dir": os.path.join(output_dir, model_name),
"log_dir": os.path.join(output_dir, model_name)})
return design_df, contrasts_df, group_config
def preset_tripled_two_group(group_list, conditions, condition_type="Sessions",
output_dir=None,
model_name="tripled_T-test"):
"""Set up the design matrix and contrasts matrix for running a tripled
two-group difference ('tripled' T-test).
group_list: a list of strings- sub_ses unique IDs
conditions: a three-item list of strings- session or series/scan names of
the three sessions or three scans (per participant) you wish
to compare
condition_type: a string, either "session" or "scan", depending on what
is in "conditions"
output_dir: (optional) string of the output directory path
model_name: (optional) name/label of the model to run
Sets up the model described here:
https://fsl.fmrib.ox.ac.uk/fsl/fslwiki/FEAT/UserGuide
#Tripled_Two-Group_Difference_.28.22Tripled.22_T-Test.29
"""
import os
if not output_dir:
output_dir = os.getcwd()
if len(conditions) != 3:
# TODO: msg
raise Exception('Three conditions are required for the tripled '
't-test.\n')
sess_conditions = ["session", "Session", "sessions", "Sessions"]
scan_conditions = ["scan", "scans", "series", "Series/Scans", "Series"]
sesflag = False
if condition_type in sess_conditions:
sesflag = True
design_df = create_design_matrix_df(group_list, ses_id=sesflag)
# make the "condition" EVs (the 1's, -1's, and 0's delineating the three
# conditions, with the "conditions" being the three sessions or three
# scans)
condition_ev_one = []
condition_ev_two = []
if condition_type in sess_conditions:
# note: the participant_id column in design_df should be in order, so
# the condition_ev's should come out in order:
# 1,1,1,-1,-1,-1, 0, 0, 0 (this is checked further down)
# 1,1,1, 0, 0, 0,-1,-1,-1
for sub_ses_id in design_df["participant_session_id"]:
if sub_ses_id.split("_")[-1] == conditions[0]:
condition_ev_one.append(1)
condition_ev_two.append(1)
elif sub_ses_id.split("_")[-1] == conditions[1]:
condition_ev_one.append(-1)
condition_ev_two.append(0)
elif sub_ses_id.split("_")[-1] == conditions[2]:
condition_ev_one.append(0)
condition_ev_two.append(-1)
group_config = {"sessions_list": conditions, "series_list": []}
elif condition_type in scan_conditions:
# TODO: re-visit later, when session/scan difference in how to run
# TODO: group-level analysis repeated measures is streamlined and
# TODO: simplified
# the information needed in this part is not encoded in the group
# sublist! user inputs the three scan names, and we have a list of
# sub_ses (which needs to be tripled), with each scan paired to each
# third of this list (will need to ensure these scans exist for each
# selected derivative in the output directory later on)
for sub_ses_id in design_df["participant_id"]:
condition_ev_one.append(1)
condition_ev_two.append(1)
for sub_ses_id in design_df["participant_id"]:
condition_ev_one.append(-1)
condition_ev_two.append(0)
for sub_ses_id in design_df["participant_id"]:
condition_ev_one.append(0)
condition_ev_two.append(-1)
# NOTE: there is only one iteration of the sub_ses list in
# design_df["participant_id"] at this point! so use append
# (twice) to triple that column:
design_df_double = design_df.append(design_df)
design_df = design_df_double.append(design_df)
group_config = {"sessions_list": [], "series_list": conditions}
else:
raise Exception('Invalid condition type: {0}\nMust be either "session" '
'or "scan".\n'.format(condition_type))
# let's check to make sure it came out right
# first third
for val in condition_ev_one[0:(len(condition_ev_one) // 3) - 1]:
if val != 1:
# TODO: msg
raise Exception
# second third
for val in condition_ev_one[(len(condition_ev_one) // 3):(len(condition_ev_one) // 3) * 2]:
if val != -1:
# TODO: msg
raise Exception
# third... third
for val in condition_ev_one[((len(condition_ev_one) // 3) * 2 + 1):]:
if val != 0:
# TODO: msg
raise Exception
# first third
for val in condition_ev_two[0:(len(condition_ev_two) // 3) - 1]:
if val != 1:
# TODO: msg
raise Exception
# second third
for val in condition_ev_two[(len(condition_ev_two) // 3):(len(condition_ev_two) // 3) * 2]:
if val != 0:
# TODO: msg
raise Exception
# third... third
for val in condition_ev_two[((len(condition_ev_two) // 3) * 2 + 1):]:
if val != -1:
# TODO: msg
raise Exception
# label the two covariate columns which encode the three conditions
column_one = "{0}_column_one".format(condition_type)
column_two = "{0}_column_two".format(condition_type)
design_df[column_one] = condition_ev_one
design_df[column_two] = condition_ev_two
# initialize the contrast dct's
contrast_one = {}
contrast_two = {}
contrast_three = {}
design_formula = "{0} + {1}".format(column_one, column_two)
# create the participant identity columns
for sub_id in design_df["participant_id"]:
new_part_col = []
new_part_label = "participant_{0}".format(sub_id)
for moving_sub_ses_id in design_df["participant_id"]:
moving_sub_id = moving_sub_ses_id.split("_")[0]
if moving_sub_id == sub_id:
new_part_col.append(1)
else:
new_part_col.append(0)
design_df[new_part_label] = new_part_col
contrast_one.update({new_part_label: 0})
contrast_two.update({new_part_label: 0})
contrast_three.update({new_part_label: 0})
if new_part_label not in design_formula:
design_formula = "{0} + {1}".format(design_formula,
new_part_label)
# finish the contrasts
# should be something like
# ses,ses,sub,sub,sub, etc.
# ses-1 - ses-2: 2, 1, 0, 0, 0...
# ses-1 - ses-3: 1, 2, 0, 0, 0...
# ses-2 - ses-3: -1, 1, 0, 0, 0, etc.
contrast_one.update({
"Contrasts": "{0}_{1} - {2}_{3}".format(condition_type,
conditions[0],
condition_type,
conditions[1])})
contrast_two.update({
"Contrasts": "{0}_{1} - {2}_{3}".format(condition_type,
conditions[0],
condition_type,
conditions[2])})
contrast_three.update({
"Contrasts": "{0}_{1} - {2}_{3}".format(condition_type,
conditions[1],
condition_type,
conditions[2])})
contrast_one.update({column_one: 2, column_two: 1})
contrast_two.update({column_one: 1, column_two: 2})
contrast_three.update({column_one: -1, column_two: 1})
try:
design_df = design_df.drop(labels=['participant_session_id'],
axis='columns')
except KeyError:
pass
contrasts = [contrast_one, contrast_two, contrast_three]
contrasts_df = create_contrasts_template_df(design_df, contrasts)
# create design and contrasts matrix file paths
design_mat_path = os.path.join(output_dir, model_name,
"design_matrix_{0}.csv".format(model_name))
contrasts_mat_path = os.path.join(output_dir, model_name,
"contrasts_matrix_{0}.csv"
"".format(model_name))
# start group config yaml dictionary
group_config.update({"pheno_file": design_mat_path,
"ev_selections": {"demean": [],
"categorical": []},
"design_formula": design_formula,
"group_sep": "Off",
"grouping_var": None,
"custom_contrasts": contrasts_mat_path,
"model_name": model_name,
"output_dir": os.path.join(output_dir, model_name),
"work_dir": os.path.join(output_dir, model_name),
"log_dir": os.path.join(output_dir, model_name)})
return design_df, contrasts_df, group_config
def run(pipeline_dir, derivative_list, z_thresh, p_thresh, preset=None,
group_list_text_file=None, pheno_file=None, pheno_sub_label=None,
output_dir=None, model_name=None, covariate=None, condition_type=None,
run=False):
# FSL FEAT presets: run regular group analysis with no changes to its
# original flow- use the generated pheno as the pheno, use the
# contrasts DF as a custom contrasts matrix, and auto-generate the
# group analysis config YAML as well
# NOTE: the input parameters above may instead come in as a single dictionary
import os
import pandas as pd
import pkg_resources as p
# make life easy
keys_csv = p.resource_filename('CPAC', 'resources/cpac_outputs.csv')
try:
keys = pd.read_csv(keys_csv)
except Exception as e:
err = "\n[!] Could not access or read the cpac_outputs.csv " \
"resource file:\n{0}\n\nError details {1}\n".format(keys_csv, e)
raise Exception(err)
if derivative_list == 'all':
derivative_list = ['alff', 'falff', 'reho', 'sca_roi', 'sca_tempreg',
'vmhc', 'centrality', 'dr_tempreg']
if pheno_file and not pheno_sub_label:
# TODO: message
raise Exception("pheno file provided, but no pheno sub label")
if pheno_sub_label and not pheno_file:
# TODO: message
raise Exception("pheno sub label provided, but no pheno file")
try:
if "None" in group_list_text_file or "none" in group_list_text_file:
group_list_text_file = None
except TypeError:
pass
if not group_list_text_file:
from CPAC.pipeline.cpac_group_runner import grab_pipeline_dir_subs
if (preset == "paired_two" or preset == "tripled_two") and "Sessions" in condition_type:
group_list = grab_pipeline_dir_subs(pipeline_dir, True)
else:
group_list = grab_pipeline_dir_subs(pipeline_dir)
group_list_text_file = os.path.join(output_dir, model_name,
"group_participant_list_"
"{0}.txt".format(model_name))
elif isinstance(group_list_text_file, list):
group_list = group_list_text_file
# write out a group analysis sublist text file so that it can be
# linked in the group analysis config yaml
group_list_text_file = os.path.join(output_dir, model_name,
"group_participant_list_"
"{0}.txt".format(model_name))
elif os.path.isfile(group_list_text_file):
group_list = read_group_list_text_file(group_list_text_file)
# write out a group analysis sublist text file so that it can be
# linked in the group analysis config yaml
group_list_text_file = os.path.join(output_dir, model_name,
"group_participant_list_"
"{0}.txt".format(model_name))
if len(group_list) == 0:
msg = "\n\n[!] C-PAC says: No participants found in the pipeline " \
"directory you provided. Make sure the directory is the " \
"individual-level pipeline directory that contains the sub-" \
"directories labeled with the participant_session IDs.\n\n" \
"Pipeline directory provided: {0}\n\n".format(pipeline_dir)
raise Exception(msg)
if not preset:
# TODO: this
pass
group_config = {"pipeline_dir": pipeline_dir,
"FSLDIR": "FSLDIR",
"run_fsl_feat": [1],
"num_models_at_once": 1,
"preset": preset,
"participant_list": group_list_text_file,
"participant_id_label": "participant_id",
"mean_mask": ["Group Mask"],
"custom_roi_mask": None,
"derivative_list": derivative_list,
"coding_scheme": ["Treatment"],
"z_threshold": [float(z_thresh)],
"p_threshold": [float(p_thresh)],
"contrasts": [],
"f_tests": [],
"run_randomise": [0],
"randomise_permutation": 500,
"randomise_thresh": 5,
"randomise_demean": True,
"randomise_tfce": True}
if preset == "single_grp":
design_df, contrasts_df, group_config_update = \
preset_single_group_avg(group_list, pheno_df=None, covariate=None,
pheno_sub_label=None,
output_dir=output_dir,
model_name=model_name)
group_config.update(group_config_update)
elif preset == "single_grp_cov":
if not pheno_file:
# TODO: message
raise Exception("pheno file not provided")
if not covariate:
# TODO: message
raise Exception("covariate not provided")
pheno_df = read_pheno_csv_into_df(pheno_file, pheno_sub_label)
design_df, contrasts_df, group_config_update = \
preset_single_group_avg(group_list, pheno_df, covariate=covariate,
pheno_sub_label=pheno_sub_label,
output_dir=output_dir,
model_name=model_name)
group_config.update(group_config_update)
elif preset == "unpaired_two":
if not pheno_file:
# TODO: message
raise Exception("pheno file not provided")
if not covariate:
# TODO: message
raise Exception("the two groups were not provided")
# we're assuming covariate will be coming in as a string of either one
# covariate name, or a string with two covariates separated by a comma
# either way, it needs to be in list form in this case, not string
covariate = covariate.split(",")
pheno_df = read_pheno_csv_into_df(pheno_file, pheno_sub_label)
# in this case, "covariate" gets sent in as a list of two covariates
design_df, contrasts_df, group_config_update = \
preset_unpaired_two_group(group_list, pheno_df,
groups=covariate,
pheno_sub_label=pheno_sub_label,
output_dir=output_dir,
model_name=model_name)
group_config.update(group_config_update)
elif preset == "paired_two":
# run a two-sample paired T-test
# we need it as repeated measures- either session or scan
# and the list of subs
# also: the two session or scan names (in a list together), and
# whether they are sessions or scans
if not covariate:
# TODO: message
raise Exception("the two conditions were not provided")
if not condition_type:
# TODO: message
raise Exception("you didn't specify whether the two groups are "
"sessions or series/scans")
# we're assuming covariate (which in this case, is the two sessions,
# or two scans) will be coming in as a string of either one covariate
# name, or a string with two covariates separated by a comma
# either way, it needs to be in list form in this case, not string
try:
covariate = covariate.split(",")
except AttributeError:
# it's already a list- keep it that way
pass
design_df, contrasts_df, group_config_update = \
preset_paired_two_group(group_list,
conditions=covariate,
condition_type=condition_type,
output_dir=output_dir,
model_name=model_name)
group_config.update(group_config_update)
elif preset == "tripled_two":
# run a "tripled" T-test
# we need it as repeated measures- either session or scan
# and the list of subs
# also: the two session or scan names (in a list together), and
# whether they are sessions or scans
if not covariate:
# TODO: message
raise Exception("the three conditions were not provided")
if not condition_type:
# TODO: message
raise Exception("you didn't specify whether the three groups are "
"sessions or series/scans")
# we're assuming covariate (which in this case, is the three sessions,
# or three scans) will be coming in as a string of either one
# covariate name, or a string with three covariates separated by a
# comma
# either way, it needs to be in list form in this case, not string
try:
covariate = covariate.split(",")
except AttributeError:
# it's already a list- keep it that way
pass
design_df, contrasts_df, group_config_update = \
preset_tripled_two_group(group_list,
conditions=covariate,
condition_type=condition_type,
output_dir=output_dir,
model_name=model_name)
group_config.update(group_config_update)
else:
# TODO: not a real preset!
raise Exception("not one of the valid presets")
# write participant list text file
write_group_list_text_file(design_df["participant_id"],
group_list_text_file)
# write design matrix CSV
write_dataframe_to_csv(design_df, group_config["pheno_file"])
# write custom contrasts matrix CSV
write_dataframe_to_csv(contrasts_df, group_config["custom_contrasts"])
# write group-level analysis config YAML
out_config = os.path.join(output_dir, model_name,
"group_config_{0}.yml".format(model_name))
write_config_dct_to_yaml(group_config, out_config)
if run:
# TODO: we need to separate the individual-level pipeline config from
# TODO: the group-level one, it's too restrictive
pass
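# Illustrative invocation (hedged sketch; every path and label below is
# hypothetical). This builds the single-group average preset with "age" as
# an additional covariate, writing the participant list, design matrix,
# contrasts matrix, and group config YAML under output_dir/model_name:
#
#     >>> run("/outputs/pipeline_xyz", derivative_list="all",
#     ...     z_thresh=2.3, p_thresh=0.05, preset="single_grp_cov",
#     ...     group_list_text_file="/analysis/group_participant_list.txt",
#     ...     pheno_file="/analysis/pheno.csv", pheno_sub_label="participant",
#     ...     output_dir="/analysis/models", model_name="grp_avg_age",
#     ...     covariate="age")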