Source code for methylprep.processing.postprocess

# Lib
import numpy as np
import pandas as pd
import os
import pickle
from pathlib import Path
import logging
# app
from ..utils import is_file_like
#from ..utils.progress_bar import * # context tqdm

os.environ['NUMEXPR_MAX_THREADS'] = "8" # suppresses warning


__all__ = ['calculate_beta_value', 'calculate_m_value', 'consolidate_values_for_sheet', 'consolidate_mouse_probes']

LOGGER = logging.getLogger(__name__)

def calculate_beta_value(methylated_noob, unmethylated_noob, offset=100):
    """ the ratio of (methylated_intensity / total_intensity)
    where total_intensity is (meth + unmeth + 100) -- to give a score in range of 0 to 1.0.
    minfi offset is 100 and sesame (default) offset is zero."""
    methylated = np.clip(methylated_noob, 1, None)
    unmethylated = np.clip(unmethylated_noob, 1, None)

    total_intensity = methylated + unmethylated + offset
    with np.errstate(all='raise'):
        intensity_ratio = np.true_divide(methylated, total_intensity)
    return intensity_ratio


def calculate_m_value(methylated_noob, unmethylated_noob, offset=0):
    """ the log(base 2) (1+meth / 1+unmeth) intensities (with a min clip intensity of 1 to avoid divide-by-zero-errors, like sesame)"""
    methylated = np.clip(methylated_noob, 1, None) + offset
    unmethylated = np.clip(unmethylated_noob, 1, None) + offset

    with np.errstate(all='raise'):
        intensity_ratio = np.true_divide(methylated, unmethylated)
    return np.log2(intensity_ratio)


def calculate_copy_number(methylated_noob, unmethylated_noob, offset=None):
    """ the log(base 2) of the combined (meth + unmeth AKA green and red) intensities """
    # Note: offset is a kwarg to match other calculate functions above, but there is no offset used in this function.
    total_intensity = methylated_noob + unmethylated_noob
    copy_number = np.log2(total_intensity)
    return copy_number


[docs]def consolidate_values_for_sheet(data_containers, postprocess_func_colname='beta_value', bit='float32', poobah=True, poobah_sig=0.05, exclude_rs=True):
    """ Transforms results into a single dataframe with all of the function values,
    with probe names in rows, and sample beta values for probes in columns.

    Input:
        data_containers -- the output of run_pipeline() is this, a list of data_containers.
        (a list of processed SampleDataContainer objects)

    Arguments for postprocess_func_colname:
        calculate_beta_value --> 'beta_value'
        calculate_m_value --> 'm_value'
        calculate_copy_number --> 'cm_value'

    note: these functions are hard-coded in pipeline.py as part of process_all() step.
    note: if run_pipeline included 'sesame' option, then quality mask is automatically applied to all pickle outputs, and saved as column in processed CSV.

    Options:
        bit (float16, float32, float64) -- change the default data type from float32
            to another type to save disk space. float16 works fine, but might not be compatible
            with all numnpy/pandas functions, or with outside packages, so float32 is default.
            This is specified from methylprep process command line.
        poobah
            If true, filters by the poobah_pval column. (beta m_val pass True in for this.)
        data_container.quality_mask (True/False)
            If 'quality_mask' is present in df, True filters these probes from pickle output.
        exclude_rs
            as of v1.5.0 SigSet keeps snp ('rs') probes with other probe types (if qualityMask is false); need to separate them here
            before exporting to file."""
    poobah_column = 'poobah_pval'
    quality_mask = 'quality_mask'
    for idx,sample in enumerate(data_containers):
        sample_id = f"{sample.sample.sentrix_id}_{sample.sample.sentrix_position}"

        if poobah == True and poobah_column in sample._SampleDataContainer__data_frame.columns:
            # remove all failed probes by replacing with NaN before building DF.
            sample._SampleDataContainer__data_frame.loc[sample._SampleDataContainer__data_frame[poobah_column] >= poobah_sig, postprocess_func_colname] = np.nan
        elif poobah == True and poobah_column not in sample._SampleDataContainer__data_frame.columns:
            LOGGER.warning('DEBUG: missing poobah')

        if sample.quality_mask == True and quality_mask in sample._SampleDataContainer__data_frame.columns:
            # blank there probes where quality_mask == 0
            sample._SampleDataContainer__data_frame.loc[sample._SampleDataContainer__data_frame[quality_mask] == 0, postprocess_func_colname] = np.nan

        this_sample_values = sample._SampleDataContainer__data_frame[postprocess_func_colname]

        if exclude_rs: # dropping rows before exporting
            mask_snps = (sample._SampleDataContainer__data_frame.index.str.startswith('rs'))
            this_sample_values = this_sample_values.loc[ ~mask_snps ]

        if idx == 0:
            merged = pd.DataFrame(this_sample_values, columns=[postprocess_func_colname])
            merged.rename(columns={postprocess_func_colname: sample_id}, inplace=True)
            continue
        merged = pd.concat([merged, this_sample_values], axis=1)
        merged.rename(columns={postprocess_func_colname: sample_id}, inplace=True)
    if bit != 'float32' and bit in ('float64','float16'):
        merged = merged.astype(bit)
    return merged


def one_sample_control_snp(container):
    """Creates the control_probes.pkl dataframe for export
Unlike all the other postprocessing functions, this uses a lot of SampleDataContainer objects that get removed to save memory,
so calling this immediately after preprocessing a sample so whole batches of data are not kept in memory.

Input: one SampleDataContainer object.

Returns:
    a pickled dataframe with non-CpG probe values.
    Control: red and green intensity
    SNP: beta values, based on uncorrected meth and unmeth intensity.
    Where
        0-0.25 ~~ homogyzous-recessive
        0.25--0.75 ~~ heterozygous
        0.75--1.00 ~~ homozygous-dominant
    for each of 50-60 SNP locii on the genome.
    methylcheck can plot these to genotype samples.

Notes:
    Each data container has a ctrl_red and ctrl_green dataframe.
    This shuffles them into a single dictionary of dataframes with keys as Sample_ID matching the meta_data.
    Where Sample_ID is:
        {container.sentrix_id_sentrix_position}
    and each dataframe contains both the ctrl_red and ctrl_green data. Column names will specify which channel
    (red or green) the data applies to.

    snp values are stored in container.snp_methylated.data_frame
    and container.snp_unmethylated.data_frame

    methylcheck.plot_controls requires these columns in output: ['Control_Type','Color','Extended_Type']
        copy from container.ctl_man

    v1.5.0+ saves either RAW or NOOB meth/unmeth SNP values, depending on whether NOOB/dye was processed.
    """
    RED = container.ctrl_red.rename(columns={'mean_value': 'Mean_Value_Red'})[['Mean_Value_Red']]
    GREEN = container.ctrl_green.rename(columns={'mean_value': 'Mean_Value_Green'})[['Mean_Value_Green']]
    CONTROL = RED.join(GREEN)
    CONTROL = pd.concat([CONTROL, container.ctl_man], axis=1)

    meth_col = 'Meth' if container.do_noob == False else 'noob_Meth'
    unmeth_col = 'Unmeth' if container.do_noob == False else 'noob_Unmeth'
    SNP = container.snp_methylated.rename(columns={meth_col: 'snp_meth'})
    SNP_UNMETH = container.snp_unmethylated.rename(columns={unmeth_col: 'snp_unmeth'})[['snp_unmeth']]
    SNP = SNP.join(SNP_UNMETH)
    # below (snp-->beta) is analogous to:
    # SampleDataContainer._postprocess(input_dataframe, calculate_beta_value, 'beta_value')
    # except that it doesn't use the predefined noob columns.
    vectorized_func = np.vectorize(calculate_beta_value)
    SNP['snp_beta'] = vectorized_func(
        SNP['snp_meth'].values,
        SNP['snp_unmeth'].values,
    )
    SNP = SNP[['snp_beta','snp_meth','snp_unmeth']]

    # space saving, but catches NaN bugs without breaking.
    if SNP[['snp_meth','snp_unmeth']].isna().sum().sum() == 0:
        SNP['snp_meth'] = SNP['snp_meth'].apply(pd.to_numeric, downcast='unsigned')
        SNP['snp_unmeth'] = SNP['snp_unmeth'].apply(pd.to_numeric, downcast='unsigned')
    if CONTROL[['Mean_Value_Green','Mean_Value_Red']].isna().sum().sum() == 0:
        CONTROL['Mean_Value_Green'] = CONTROL['Mean_Value_Green'].apply(pd.to_numeric, downcast='unsigned')
        CONTROL['Mean_Value_Red'] = CONTROL['Mean_Value_Red'].apply(pd.to_numeric, downcast='unsigned')

    CONTROL = CONTROL.join(SNP, how='outer').round({'snp_beta':3})
    # finally, copy 'design' col from manifest, if exists
    if 'design' in container.man.columns:
        probe_designs = container.man[['design']]
        CONTROL = CONTROL.join(probe_designs, how='left')
    return CONTROL


def consolidate_mouse_probes(data_containers, filename_or_fileobj, file_format, object_name='mouse_data_frame', poobah_column='poobah_pval', poobah_sig=0.05):
    """ these probes have 'Multi'|'Random' in `design` col of mouse manifest. used to populate 'mouse_probes.pkl'.
    pre v1.4.6: ILLUMINA_MOUSE specific probes (starting with 'rp' for repeat sequence or 'mu' for murine, 'uk' for unknown-experimental)
    stored as data_container.mouse_data_frame.

    saves as a dataframe just like controls:
        a dict of dataframes like processed.csv format, but only mouse probes.
        keys are sample_ids -- values are dataframes"""
    out = dict()
    for idx,sample in enumerate(data_containers):
        sample_id = f"{sample.sample.sentrix_id}_{sample.sample.sentrix_position}"
        data_frame = getattr(sample, object_name)
        data_frame = data_frame.round({'noob_meth':0, 'noob_unmeth':0, 'm_value':3, 'beta_value':3, 'cm_value':3,
            'meth':0, 'unmeth':0, 'poobah_pval':3})
        out[sample_id] = data_frame

    if is_file_like(filename_or_fileobj): # and file_format == 'pickle':
        pickle.dump(out, filename_or_fileobj)
    #elif is_file_like(filename_or_fileobj) and file_format == 'parquet':

    else: #except TypeError: # File must have a write attribute
        with open(filename_or_fileobj, 'wb') as f:
            pickle.dump(out, f)
    return

def merge_batches(num_batches, data_dir, filepattern, file_format):
    """for each of the output pickle file types,
    this will merge the _1, _2, ..._X batches into a single file in data_dir.
    """
    dfs = []
    suffix = 'parquet' if file_format == 'parquet' else 'pkl'
    for num in range(num_batches):
        try:
            filename = f"{filepattern}_{num+1}.{suffix}"
            part = Path(data_dir, filename)
            if part.exists() and file_format == 'parquet':
                dfs.append( pd.read_parquet(part) )
            elif part.exists():
                dfs.append( pd.read_pickle(part) )
        except Exception as e:
            LOGGER.error(f'error merging batch {num} of {filepattern}')
    if dfs: # pipeline passes in all filenames, but not all exist
        dfs = pd.concat(dfs, axis='columns', join='inner')
        outfile_name = Path(data_dir, f"{filepattern}.{suffix}")
        LOGGER.info(f"{filepattern}: {dfs.shape}")
        func = dfs.to_parquet if file_format == 'parquet' else dfs.to_pickle
        func(str(outfile_name))
        del dfs # save memory.

        # confirm file saved ok.
        if not Path(outfile_name).exists():
            print("error saving consolidated file: {outfile_name}; use methylcheck.load() to merge the parts")
            return

    # now delete the parts
    for num in range(num_batches):
        filename = f"{filepattern}_{num+1}.{suffix}"
        part = Path(data_dir, filename)
        if part.exists():
            part.unlink() # delete it