Source code for methylprep.models.sigset

# Lib
import logging
import pandas as pd
import numpy as np
# App
from ..models import (
    ArrayType,
    Channel,
    ProbeType,
)
from ..files import IdatDataset
from ..utils.progress_bar import * # checks environment and imports tqdm appropriately.
from collections import Counter


__all__ = ['SigSet', 'parse_sample_sheet_into_idat_datasets', 'RawMetaDataset']


LOGGER = logging.getLogger(__name__)

[docs]def get_array_type(idat_dataset_pairs):
    """ provide a list of idat_dataset_pairs and it will return the array type, confirming probe counts match in batch. """
    array_types = {dataset['array_type'] for dataset in idat_dataset_pairs}
    if len(array_types) == 0:
        raise ValueError('could not identify array type from IDATs')
    elif len(array_types) != 1:
        raise ValueError('IDATs with varying array types')
    array_type = array_types.pop()
    return array_type

[docs]class RawMetaDataset():
    """Wrapper for a sample and meta data, without its pair of raw IdatDataset values."""
    def __init__(self, sample):
        self.sample = sample

[docs]def parse_sample_sheet_into_idat_datasets(sample_sheet, sample_name=None, from_s3=None, meta_only=False, bit='float32'):
    """Generates a collection of IdatDatasets from samples in a sample sheet.

    Arguments:
        sample_sheet {SampleSheet} -- The SampleSheet from which the data originates.

    Keyword Arguments:
        sample_name {string} -- Optional: one sample to process from the sample_sheet. (default: {None})
        from_s3 {zip_reader} -- pass in a S3ZipReader object to extract idat files from a zipfile hosted on s3.
        meta_only {True/False} -- doesn't read idat files, only parses the meta data about them.
        (RawMetaDataset is same as RawDataset but has no idat probe values stored in object, because not needed in pipeline)

    Raises:
        ValueError: If the number of probes between raw datasets differ.

    Returns:
        [RawDatasets] -- A list of idat data pairs, each a dict like {'green_idat': green_idat, 'red_idat': red_idat}
    """
    # now idat_datasets is not a class, but just a list of dicts, with each dict being a pair of red_idat and green_idat Objects.
    if not sample_name:
        samples = sample_sheet.get_samples()
    elif type(sample_name) is list:
        samples = [
            sample_sheet.get_sample(sample)
            for sample in sample_name
        ]
    else:
        samples = [sample_sheet.get_sample(sample_name)]
        LOGGER.info("Found sample in SampleSheet: {0}".format(sample_name))

    #LOGGER.info(f'Reading {len(samples)} IDATs from sample sheet')
    if from_s3 and meta_only:
        parser = RawMetaDataset
        idat_datasets = [parser(sample) for sample in samples]
    elif from_s3 and not meta_only:
        #parser = RawDataset.from_sample_s3
        zip_reader = from_s3
        def parser(zip_reader, sample):
            green_filepath = sample.get_filepath('idat', Channel.GREEN)
            green_idat = IdatDataset(green_filepath, channel=Channel.GREEN, bit=bit)
            red_filepath = sample.get_filepath('idat', Channel.RED)
            red_idat = IdatDataset(red_filepath, channel=Channel.RED, bit=bit)
            return {'green_idat': green_idat, 'red_idat': red_idat, 'sample': sample}
        idat_datasets = []
        for sample in tqdm(samples, total=len(samples), desc='Reading IDATs'):
            idat_datasets.append(parser(zip_reader, sample))
    elif not from_s3 and not meta_only:
        #parser = RawDataset.from_sample
        def parser(sample):
            green_filepath = sample.get_filepath('idat', Channel.GREEN)
            green_idat = IdatDataset(green_filepath, channel=Channel.GREEN, bit=bit)
            red_filepath = sample.get_filepath('idat', Channel.RED)
            red_idat = IdatDataset(red_filepath, channel=Channel.RED, bit=bit)
            return {'green_idat': green_idat, 'red_idat': red_idat, 'sample': sample}
        idat_datasets = []
        for sample in tqdm(samples, total=len(samples), desc='Reading IDATs'):
            idat_datasets.append(parser(sample))

    if not meta_only:
        idat_datasets = list(idat_datasets) # tqdm objects are not subscriptable, not like a real list
        # ensure all idat files have same number of probes
        batch_probe_counts = set()
        counts_per_sample = Counter()
        for idx,dataset in enumerate(idat_datasets):
            snps_read = {dataset['green_idat'].n_snps_read, dataset['red_idat'].n_snps_read}
            if len(snps_read) > 1:
                raise ValueError('IDAT files have a varying number of probes (compared Grn to Red channel)')
            n_snps_read = snps_read.pop()
            batch_probe_counts.add(n_snps_read)
            counts_per_sample[n_snps_read] += 1
            idat_datasets[idx]['array_type'] = ArrayType.from_probe_count(n_snps_read)
        if len(batch_probe_counts) != 1:
            array_types = Counter([dataset['array_type'] for dataset in idat_datasets])
            LOGGER.warning(f"These IDATs have varying numbers of probes: {counts_per_sample.most_common()} for these array types: {array_types.most_common()}")
            LOGGER.warning(f"(Processing will drop any probes that are not found across all samples for a given array type.)")
    return idat_datasets


[docs]class SigSet():
    """
    I’m gonna try to create a fresh methylprep “SigSet” to replace our methylationDataset and RawDataset objects, which are redundant, and even have redundant functions within them. Part of why I have been frustrated/confused by our code.
    Central to the SeSAMe platform is the SigSet data structure, an S4 class with slots containing signals for six different classes of probes:
    [x] II - Type-II probes;
    [x] IR - Type-I Red channel probes;
    [x] IG - Type-I Grn channel probes;
    [x] oobG - Out-of-band Grn channel probes (matching Type-I Red channel probes in number);
    [x] oobR - Out-of-band Red channel probes (matching Type-I Grn channel probes in number);
    [x] ctrl_green, ctrl_red - control probes.
    [x] methylated, unmethylated, snp_methylated, snp_unmethylated
    [x] fg_green, fg_red (opposite of oobG and oobR) AKA ibG, ibR for in-band probes.

    - just tidying up how we access this stuff, and trying to stick to IlmnID everywhere because the illumina_id within IDAT files is no longer unique as a ref.
    - I checked again, and no other array breaks these rules. But sounds like Bret won’t stick to this pattern going forward with designs. So I suspect other software will break with new arrays, unless they rewrite for this too.

    - this combines every layer of objects between IdatDatasets and SampleDataContainers.
    - this avoids looping through probe subsets, instead referring to a lookup-dataframe of how these relate.
    - avoids probes.py
        probe_type is a derived label, not in manifest (I, II, SnpI, SnpII, control)
    """
    __bg_corrected = False
    __minfi_noob = False # linear_dye applied
    __dye_bias_corrected = False
    __preprocessed = False

    # we found the SECRET DECODER RING!
    #name | data_channel | probe_address | probe_channel | probe_type | fg_green | fg_red | meth | unmeth | snp_meth | snp_unmeth
    # 'data_channel' refers to either the green_idat red_idat values; 'Color_Channel' + probe_address is used by type-I probes to refer to one of two measurements within that probe.
    df_columns=['name','data_channel','probe_address','Color_Channel','probe_type',
        'fg_green', 'fg_red', 'meth', 'unmeth', 'snp_meth', 'snp_unmeth', 'foreground', 'snp', 'Infinium_Design_Type']
    data=[ # in-band probes
        ['II-None-Green-Meth', 'GREEN', 'AddressA_ID', None, 'II', 1, 0, 1, 0, 0, 0, 1, 0, 'II'],
        ['IG-A-Unmeth', 'GREEN', 'AddressA_ID', 'Grn', 'I', 1, 0, 0, 1, 0, 0, 1, 0, 'I'],
        ['IG-B-Meth', 'GREEN', 'AddressB_ID', 'Grn', 'I', 1, 0, 1, 0, 0, 0, 1, 0, 'I'],
        ['II-None-Red-Unmeth', 'RED',   'AddressA_ID', None, 'II', 0, 1, 0, 1, 0, 0, 1, 0, 'II'],
        ['IR-A-Unmeth', 'RED', 'AddressA_ID', 'Red', 'I', 1, 0, 0, 1, 0, 0, 1, 0, 'I'],
        ['IR-B-Meth', 'RED', 'AddressB_ID', 'Red', 'I', 1, 0, 1, 0, 0, 0, 1, 0, 'I'],
        # in-band SNPS
        ['SnpII-None-Green-Meth', 'GREEN', 'AddressA_ID', None, 'SnpII', 1, 0, 0, 0, 1, 0, 1, 1, 'II'],
        ['SnpIG-A-Unmeth', 'GREEN', 'AddressA_ID', 'Grn', 'SnpI', 1, 0, 0, 0, 0, 1, 1, 1, 'I'],
        ['SnpIG-B-Meth', 'GREEN', 'AddressB_ID', 'Grn', 'SnpI', 1, 0, 0, 0, 1, 0, 1, 1, 'I'],
        ['SnpII-None-Red-Unmeth', 'RED', 'AddressA_ID', None, 'SnpII', 0, 1, 0, 0, 0, 1, 1, 1, 'II'],
        ['SnpIR-A-Unmeth', 'RED', 'AddressA_ID', 'Red', 'SnpI', 1, 0, 0, 0, 0, 1, 1, 1, 'I'],
        ['SnpIR-B-Meth', 'RED', 'AddressB_ID', 'Red', 'SnpI', 1, 0, 1, 0, 0, 0, 1, 1, 'I'],
        # oob subsets -- these should include Snp-oob probes, but they need to be addressed separately.
        ['oobG-Unmeth', 'GREEN', 'AddressA_ID', 'Red', 'I', 0, 0, 0, 1, 0, 0, 0, 0, 'I'],
        ['oobG-Meth',   'GREEN', 'AddressB_ID', 'Red', 'I', 0, 0, 1, 0, 0, 0, 0, 0, 'I'],
        ['oobR-Unmeth', 'RED', 'AddressA_ID', 'Grn', 'I', 0, 0, 0, 1, 0, 0, 0, 0, 'I'],
        ['oobR-Meth',   'RED', 'AddressB_ID', 'Grn', 'I', 0, 0, 1, 0, 0, 0, 0, 0, 'I'],
        ['SnpIG-oobG-Unmeth', 'GREEN', 'AddressA_ID', 'Red', 'SnpI', 0, 0, 0, 0, 0, 1, 0, 1, 'I'],
        ['SnpIG-oobG-Meth',   'GREEN', 'AddressB_ID', 'Red', 'SnpI', 0, 0, 0, 0, 1, 0, 0, 1, 'I'],
        ['SnpIR-oobR-Unmeth', 'RED', 'AddressA_ID', 'Grn', 'SnpI', 0, 0, 0, 0, 0, 1, 0, 1, 'I'],
        ['SnpIR-oobR-Meth',   'RED', 'AddressB_ID', 'Grn', 'SnpI', 0, 0, 0, 0, 1, 0, 0, 1, 'I']]

    idat_decoder = pd.DataFrame(data=data, columns=df_columns).set_index('name')

    # how to decode the idat into logical chunks for processing; sesame II/IG/IR includes snps by default; mprep separates snps from meth/unmeth/fg_red/fg_green
    subsets = {
        'II': ['II-None-Green-Meth', 'II-None-Red-Unmeth', 'SnpII-None-Green-Meth', 'SnpII-None-Red-Unmeth'],
        'IG': ['IG-A-Unmeth', 'IG-B-Meth', 'SnpIG-A-Unmeth', 'SnpIG-B-Meth'],
        'IR': ['IR-A-Unmeth', 'IR-B-Meth', 'SnpIR-A-Unmeth', 'SnpIR-B-Meth'],
        'oobG': ['oobG-Unmeth','oobG-Meth', 'SnpIG-oobG-Unmeth', 'SnpIG-oobG-Meth'],
        'oobR': ['oobR-Unmeth','oobR-Meth', 'SnpIR-oobR-Unmeth', 'SnpIR-oobR-Meth'],
        'methylated': ['II-None-Green-Meth', 'IG-B-Meth', 'IR-B-Meth', 'SnpII-None-Green-Meth', 'SnpIG-B-Meth', 'SnpIR-B-Meth'],
        'unmethylated': ['II-None-Red-Unmeth', 'IG-A-Unmeth', 'IR-A-Unmeth', 'SnpII-None-Red-Unmeth', 'SnpIG-A-Unmeth', 'SnpIR-A-Unmeth'],
        'snp_methylated': ['SnpII-None-Green-Meth', 'SnpIG-B-Meth', 'SnpIR-B-Meth'],
        'snp_unmethylated': ['SnpII-None-Red-Unmeth', 'SnpIG-A-Unmeth', 'SnpIR-A-Unmeth'],
        #'fg_green': ['II-None-Green-Meth', 'IG-A-Unmeth', 'IG-B-Meth'],
        #'fg_red':   ['II-None-Red-Unmeth', 'IR-A-Unmeth', 'IR-B-Meth'],
        # ibG is fg_green plus the SNPs
        'ibG': ['II-None-Green-Meth', 'IG-A-Unmeth', 'IG-B-Meth', 'SnpIG-A-Unmeth', 'SnpIG-B-Meth', 'SnpII-None-Green-Meth'],
        'ibR': ['II-None-Red-Unmeth', 'IR-A-Unmeth', 'IR-B-Meth', 'SnpIR-A-Unmeth', 'SnpIR-B-Meth', 'SnpII-None-Red-Unmeth'],
        # 'ctrl_green' and 'ctrl_red' are defined attributes below, because they use a separate manifest and addressing system.
    }
    # after __init__, SigSet will have class variables for each of the keys in subsets above.

    def __init__(self, sample, green_idat, red_idat, manifest, debug=False):
        """ green_idat has .probe_means and .meta as main functions
        and for extra info, use extra kwargs:
        red= m.files.IdatDataset('9247377093_R02C01_Red.idat', m.models.Channel.RED, verbose=True, std_dev=True, nbeads=True)
        """
        self.debug = debug
        snps_read = {green_idat.n_snps_read, red_idat.n_snps_read}
        if len(snps_read) > 1:
            raise ValueError('IDAT files have a varying number of probes (comparing Grn to Red channel)')
        if (str(green_idat.channel) != 'Grn' or str(red_idat.channel) != 'Red'):
            raise ValueError("The IDAT files you supplied seem to be reversed. Check the order of your inputs to SigSet")
        self.n_snps_read = snps_read.pop()
        # DEBUG
        #self.array_type = ArrayType.from_probe_count(self.n_snps_read)

        # these next two should be unnecessary, because nothing should be reading idats downstream; use self.data_channel instead
        #self.green_idat = green_idat
        #self.red_idat = red_idat
        self.data_channel = {'GREEN': green_idat.probe_means, 'RED': red_idat.probe_means} # indexed to illumina_ids
        # illumina_ids are all II means, plus a stacked list of type-I-AddressA and type-I-AddressB means
        self.sample = sample
        self.man = manifest.data_frame # relevant columns are 'probe_type', AddressA_ID, AddressB_ID, index, Color_Channel
        self.man = self.man[ ~self.man.index.str.startswith('rs') ] # snp_man covers these
        self.snp_man = manifest.snp_data_frame.set_index('IlmnID')
        self.ctl_man = manifest.control_data_frame
        self.ctrl_green = self.ctl_man.merge(
            green_idat.probe_means.astype('float32'),
            how='inner', left_index=True, right_index=True)
        self.ctrl_red = self.ctl_man.merge(
            red_idat.probe_means.astype('float32'),
            how='inner', left_index=True, right_index=True)
        self.array_type = manifest.array_type
        if self.array_type == ArrayType.ILLUMINA_MOUSE:
            self.mouse_probes_mask = ( (self.man['design'] == 'Multi')  | (self.man['design'] == 'Random') )
        else:
            self.mouse_probes_mask = None
        self.address_code = {'AddressA_ID':'A', 'AddressB_ID':'B', 'A':'AddressA_ID', 'B':'AddressB_ID'}
        """
        ## SigSet EPIC
        ##  - @IG probes: 49989 - 332 4145 70 7094 599 2958 ...
        ##  - @IR probes: 92294 - 183 8040 1949 6152 833 89 ...
        ##  - @II probes: 724612 - 6543 1596 3133 1011 3035 2837 ...
        ##  - @oobG probes: 92294 - 138 277 107 218 232 80 ...
        ##  - @oobR probes: 49989 - 1013 150 81 910 448 183 ...
        ##  - @ctl probes: 635 ...
        ##  - @pval: 866895 - 0.005141179 0.04914081 0.002757492 ...

        SigSet 450k
        @II 350076 ................... methylated   485512
        @IG 46298 ... oobR 46298 ..... unmethylated 485512
        @IR 89203 ... oobG 89203 ..... snp_methylated   65
        .............................. snp_unmethylated 65
        fg_green 396325 |vs| ibG 396374 (incl 40 + 9 SNPs)  --(flattened)--> 442672
        fg_red   439223 |vs| ibR 439279 (incl 40 + 16 SNPs) --(flattened)--> 528482
        """

        if debug: print('DEBUG comparing [manifest probe_IDs vs idat probe_means]')

        for subset, decoder_parts in self.subsets.items():
            data_frames = {}
            for part in decoder_parts:
                i = self.idat_decoder.loc[part]
                ref = self.snp_man.copy() if i['snp'] == 1 else self.man.copy()
                # can't merge on NAType, so filling in -1s. No probe_means illumina_ids will match -1
                # using -1 instead of NaN throughout solves a lot of problems!
                if ref['AddressA_ID'].isna().sum() > 0:
                    ref.loc[:, 'AddressA_ID'].fillna(-1, inplace=True)
                if ref['AddressB_ID'].isna().sum() > 0:
                    #ref.loc[ (ref['AddressB_ID'].isnull()), 'AddressB_ID'] = -1
                    ref.loc[:, 'AddressB_ID'].fillna(-1, inplace=True)
                # and pandas won't compare NaN to NaN... so need this extra color_channel filter
                color_channel = ref['Color_Channel'].isna() if i['Color_Channel'] is None else (ref['Color_Channel'] == i['Color_Channel'])
                probe_ids = ref[ (ref['Infinium_Design_Type'] == i['Infinium_Design_Type']) & (color_channel) ][i['probe_address']]
                probe_means = self.data_channel[i['data_channel']] # starts with all 361821 mouse probes here, keyed to illumina_ids
                probe_means = probe_means.reset_index() # index is Nth row; illumina_id now a column that can be redundant
                probe_means = probe_means[ probe_means.illumina_id.isin(probe_ids) ]
                if len(probe_ids) == 0:
                    LOGGER.error(f"SigSet.init(): no probes matched for {subset}:{part}")
                #************ DEBUG ***********#
                if debug:
                    #print(f"DEBUG duplicated probe_ids (from manifest): {len( probe_ids[probe_ids.duplicated(keep=False)] )}")
                    duped = len( probe_ids[probe_ids.duplicated(keep=False)] )
                    dupe_msg = f"-- {duped} multiprobes" if duped != 0 else ''
                    means_msg = probe_means.shape if probe_means.shape[0] != probe_ids.shape[0] else 'OK'
                    print(f"DEBUG {subset} -- {part}: {probe_ids.shape} -- {means_msg} {dupe_msg}")
                # 2021-11-29: confirmed that all 361821 mouse means in IDAT DO get read. 4622 of these are control probes, but
                # methylprep only uses 633 of them (matching 635 EPIC probes for QC).
                # 919 of these probes are duplicates having the same illumina_id but different IlmnIDs (TC11, TC12, TC13 etc..) that dont merge right.
                #if subset == 'methylated' and part == 'IR-B-Meth':

                #************ DEBUG ***********#
                # merge and establish IlmnIDs from illumina_ids here
                ref_probe_names = ref[['AddressA_ID', 'AddressB_ID']].reset_index()
                probe_subset_data = ref_probe_names.merge(probe_means, how='inner',
                    left_on=i['probe_address'],
                    right_on='illumina_id')
                probe_subset_data['used'] = self.address_code[i['probe_address']]
                mean_col_name = 'Meth' if 'Meth' in part else 'Unmeth'
                probe_subset_data = probe_subset_data.rename(columns={'mean_value': mean_col_name})
                probe_subset_data = probe_subset_data.drop(['illumina_id'], axis='columns')
                # looks like duplicated probes DO have different mean_values, so it works. nothing is lost.
                #if (probe_subset_data.IlmnID.duplicated().sum() > 0
                #    or probe_subset_data.IlmnID.isna().sum() > 0
                #    or probe_subset_data[mean_col_name].isna().sum() > 0
                #    or probe_subset_data['AddressA_ID'].duplicated().sum() > 0
                #    or ('II' not in part and probe_subset_data['AddressB_ID'].duplicated().sum() > 0)):
                #    print('ERROR')
                #    import pdb;pdb.set_trace()
                probe_subset_data = probe_subset_data.set_index('IlmnID')
                data_frames[part] = probe_subset_data

            try:
                # here, put the meth and unmeth parts into separate columns as we combine
                meth_parts = [frame for frame in data_frames.values() if 'Meth' in frame.columns]
                unmeth_parts = [frame for frame in data_frames.values() if 'Unmeth' in frame.columns]
                if unmeth_parts == []:
                    data_frame = pd.concat(meth_parts)
                    data_frame['Unmeth'] = None
                elif meth_parts == []:
                    data_frame = pd.concat(unmeth_parts)
                    data_frame['Meth'] = None
                else:
                    data_frame = pd.concat(meth_parts)
                    # need to keep NaNs in Meth / Unmeth when merging, so 'outer'
                    test_data_frame = data_frame.merge(pd.concat(unmeth_parts)[['Unmeth']], left_index=True, right_index=True, how='inner')
                    data_frame = data_frame.merge(pd.concat(unmeth_parts)[['Unmeth']], left_index=True, right_index=True, how='outer')
                    #if len(data_frame) != len(test_data_frame):
                    #    print(f"---- DEBUG {subset} {data_frame.shape} vs {test_data_frame.shape}")
                    # -- this explained by having NaNs in either Meth/Unmeth channel
                if debug:
                    print(subset, len(data_frame))
                setattr(self, subset, data_frame)
            except Exception as e:
                raise Exception(f"SigSet: {e}")

        self.starting_probe_counts = {subset: getattr(self, subset).shape[0] for subset in self.subsets.keys()} # DEBUGGING
        self.detect_and_drop_duplicates()
        if debug: self.check_for_probe_loss()

    # originally was `set_bg_corrected` from MethylationDataset | called by NOOB
[docs]    def update_probe_means(self, noob_green, noob_red, red_factor=None):
        """ pass in two dataframes (green and red) with IlmnIDs in index and a 'bg_corrected' column in each.

        because __init__ has created each subset as a dataframe with IlmnID in index, this matches to index.
        and uses decoder to parse whether 'Meth' or 'Unmeth' values get updated.

        upstream: container.sigset.update_probe_means(noob_green, noob_red)

        replaces 'bg_corrected' column with 'noob_Meth' or 'noob_Unmeth' column.

        does NOT update ctrl_red or ctrl_green; these are updated within the NOOB function because structually different.
        """

        for probe_subset, decoder_parts in self.subsets.items():
            if self.debug: print(f'--- probe_subset {probe_subset} ---')
            df = getattr(self, probe_subset) # contains multiple parts in one dataframe, so need to update with logic each time
            df = df.assign(noob_Meth=None)
            df['noob_Meth'] = df['noob_Meth'].astype('float32')
            df = df.assign(noob_Unmeth=None)
            df['noob_Unmeth'] = df['noob_Unmeth'].astype('float32')
            # need to assign new values to red and green channels separately, and decode into meth/unmeth column
            for part in decoder_parts:
                try:
                    #**** NOOB is matching using tango A/B IDs upstream, must change. ****#
                    # assuming probes are passed in indexed to IlmnIDs, we can merge/update whatever matches
                    i = self.idat_decoder.loc[part]
                    if i['meth'] == 1:
                        column = 'Meth'
                        bg_column = 'noob_Meth'
                    if i['unmeth'] == 1:
                        column = 'Unmeth'
                        bg_column = 'noob_Unmeth'
                    if i['snp_meth'] == 1:
                        column = 'Meth'
                        bg_column = 'noob_Meth'
                    if i['snp_unmeth'] == 1:
                        column = 'Unmeth'
                        bg_column = 'noob_Unmeth'

                    if i['data_channel'] == 'GREEN':
                        updated = noob_green.copy()
                    elif i['data_channel'] == 'RED':
                        updated = noob_red.copy()
                        if red_factor is not None:
                            # NOTE: this changes None to NaN and dtype is Object
                            updated['bg_corrected'] = (updated['bg_corrected'] * red_factor).round(0)

                    # what IlmnIDs are in this subset?
                    ref = self.snp_man if i['snp'] == 1 else self.man
                    # and pandas won't compare NaN to NaN... so need this extra color_channel filter
                    color_channel = ref['Color_Channel'].isna() if i['Color_Channel'] is None else ref['Color_Channel'] == i['Color_Channel']
                    IlmnIDs = ref[ (ref['Infinium_Design_Type'] == i['Infinium_Design_Type']) & (color_channel) ].index

                    # NOT SURE ABOUT THIS HACK. IT WORKS, but why?
                    if probe_subset in ('oobG','oobR'):
                        # swap the data_channels
                        updated = noob_red.copy() if probe_subset == 'oobG' else noob_green.copy()
                        # ... and optionally, grab the IlmnIDs from df and match them instead. Wasn't necessary. <-- this is prone to mismatching across probe_subsets.
                        #IlmnIDs = set(list(df.sort_index().index)) & set(list(updated['IlmnID']))

                    # use update; cannot merge if the IlmnIDs overlap, to avoid NaNs and bg_corrected_x/_y cols
                    updated = updated.rename(columns={'bg_corrected':bg_column})
                    updated = updated.loc[ updated['IlmnID'].isin(IlmnIDs)]
                    # update() matches on IlmnID index, but noob_green/red have multiple meth/unmeth values per IlmnID so have to split it apart.
                    if updated['IlmnID'].duplicated().sum() != 0:
                        #raise AssertionError(f"{probe_subset} -- {part} contains duplicate IlmnIDs, after filtering")
                        # --- there are duplicate IlmnIDs here, so can't index to it until I split the M and U derived parts out
                        if 'Meth' in part:
                            used = 'M'
                        elif 'Unmeth' in part:
                            used = 'U'
                        else:
                            raise ValueError(f"Not sure how to read {part}; is it Meth or Unmeth?")
                        updated = updated.loc[ updated['used'] == used ]
                        if updated['IlmnID'].duplicated().sum() != 0:
                            raise AssertionError(f"{probe_subset} -- {part} -- {used} contains duplicate IlmnIDs, after filtering twice")
                    updated = updated.set_index('IlmnID')
                    debug_pre = df[bg_column].isna().sum()
                    df.update(updated[[bg_column]])
                    debug_post = df[bg_column].isna().sum()
                    num_updated = debug_pre - debug_post
                    if self.debug: print(f"{part} {df.shape[0]} (+{num_updated})")
                    # overwrite!
                    setattr(self, probe_subset, df)
                except Exception as e:
                    print(f"**** Sigset: error in update_probe_means: {e} ***")
            if self.starting_probe_counts.get(probe_subset) != getattr(self, probe_subset).shape[0]:
                if self.debug: LOGGER.warning(f"Update probes: {probe_subset} count changed from {self.starting_probe_counts.get(probe_subset)} to {getattr(self, probe_subset).shape[0]}")

        self.__preprocessed = True # applied by set_noob
        self.__bg_corrected = True
        self.__minfi_noob = False
        self.__linear_dye = True if red_factor is not None else False

    """
    # from raw_dataset; may no longer be needed, but kept for testing against new approach 2021
    def get_oob_controls(self, green_idat, red_idat, manifest, include_rs=True):
        ''' Out-of-bound controls are the mean intensity values for the
        channel in the opposite channel's probes (IG oob and IR oob)

.. todo::
    TEST -- does this give same output as SigSet.oobG and oobR?
        '''
        param_sets = [
            {'channel': Channel.RED, 'idat': green_idat, 'manifest': manifest},
            {'channel': Channel.GREEN, 'idat': red_idat, 'manifest': manifest},
        ]

        for channel_params in param_sets:
            channel = channel_params['channel']
            idat_dataset = channel_params['idat']
            manifest = channel_params['manifest']
            probe_means = idat_dataset.probe_means # index matches AddressA_ID or AddressB_ID, depending on RED/GREEN channel

            probes = manifest.get_probe_details(
                probe_type=ProbeType.ONE, # returns IR or IG cgxxxx probes only
                channel=channel,
            )[['AddressA_ID', 'AddressB_ID']]
            if include_rs:
                snp_probes = manifest.get_probe_details(
                    probe_type=ProbeType.SNP_ONE,
                    channel=channel,
                )[['AddressA_ID', 'AddressB_ID']]
                probes = pd.concat([probes, snp_probes])

            if channel == Channel.RED:
                oobG = probes.merge(
                    probe_means, # green channel X AddresB (meth) channel
                    how='inner',
                    left_on='AddressB_ID',
                    right_index=True,
                    suffixes=(False, False),
                ).rename(columns={'mean_value': 'Meth'})
                oobG = oobG.merge(
                    probe_means, # green channel X AddresA (unmeth) channel
                    how='inner',
                    left_on='AddressA_ID',
                    right_index=True,
                    suffixes=(False, False),
                ).rename(columns={'mean_value': 'Unmeth'}).sort_values('IlmnID')
                oobG.drop(['AddressA_ID', 'AddressB_ID'], axis=1)

            if channel == Channel.GREEN:
                oobR = probes.merge(
                    probe_means, # red channel X AddressB for (meth)
                    how='inner',
                    left_on='AddressB_ID',
                    right_index=True,
                    suffixes=(False, False),
                ).rename(columns={'mean_value': 'Meth'}).sort_values('IlmnID')
                oobR = oobR.merge(
                    probe_means, # red channel X AddressA for (unmeth)
                    how='inner',
                    left_on='AddressA_ID',
                    right_index=True,
                    suffixes=(False, False),
                ).rename(columns={'mean_value': 'Unmeth'})
                oobR.drop(['AddressA_ID', 'AddressB_ID'], axis=1)
        return (oobG.sort_index(), oobR.sort_index())
    """

    # from raw_dataset
    def filter_oob_probes(self, channel, manifest, idat_dataset, include_rs=True):
        raise KeyError("filter_oob_probes replaced by (is part of) SigSet.get_oob_controls in v1.5+")

    # from MethylationDataset | called by self.set_bg_corrected for each probe subset | now part of set_bg_corrected
    # from MethylationDataset | now part of update_probe_means
[docs]    def set_noob(self, red_factor):
        """ same method as update_probe_means, but simply applies a linear correction to all RED channel values """
        #update_probe_means(self, noob_green, noob_red, red_factor)
        raise KeyError("set_noob replaced by update_probe_means in v1.5+")

[docs]    def detect_and_drop_duplicates(self):
        """ as of v1.5.0, mouse manifest includes a few probes that cause duplicate values, and breaks processing.
        So this removes them. About 5 probes in all.

        Note: This runs during SigSet__init__,
        and might fail if any of these probes are affected by inter_type_I_probe_switch(),
        which theoretically should never happen in mouse. But infer-probes affects the idat probe_means directly,
        and runs before SigSet is created in SampleDataContainer, to avoid double-reading confusion.
        """
        probe_count = 0
        # (1) look for dupes within a subset; mouse.methylated has 2 to drop
        for subset in self.subsets:
            this = getattr(self, subset)
            if this.index.duplicated().sum() > 0:
                pre = this.index.duplicated().sum()
                this = this.loc[ ~this.index.duplicated() ]
                setattr(self, subset, this)
                this = getattr(self, subset)
                probe_count += pre
                if self.debug:
                    LOGGER.info(f"Dropped duplicate probes from SigSet.{subset}: {pre} --> {this.index.duplicated().sum()}")

        # (2) look between paired subsets; the index probe names should match exactly.
        # but if idat probe_means is missing for one or the other (AddressA_ID / AddressB_ID error?)
        # drop these. mouse has 3 to drop.
        matched_sets = [
            ('methylated','unmethylated')
        ]
        # either remove the mismatched ones, or add in missing values to other datasets (assume min fluor of 1.0)
        for partA,partB in matched_sets:
            if set(getattr(self, partA).index) - set(getattr(self, partB).index) != set():
                if self.debug:
                    LOGGER.info(f"mismatched probes ({partA} - {partB}): {set(getattr(self, partA).index) - set(getattr(self, partB).index)}")
                this = getattr(self, partA)
                mismatched = list(set(getattr(self, partA).index) - set(getattr(self, partB).index))
                this = this.loc[ ~this.index.isin(mismatched) ]
                setattr(self, partA, this)
            if set(getattr(self, partB).index) - set(getattr(self, partA).index) != set():
                probe_count += len(set(getattr(self, partB).index) - set(getattr(self, partA).index))
                if self.debug:
                    LOGGER.info(f"mismatched probes ({partB} - {partA}): {set(getattr(self, partB).index) - set(getattr(self, partA).index)}")
                this = getattr(self, partB)
                mismatched = list(set(getattr(self, partB).index) - set(getattr(self, partA).index))
                this = this.loc[ ~this.index.isin(mismatched) ]
                setattr(self, partB, this)

[docs]    def check_for_probe_loss(self, stage=''):
        """Debugger runs this during processing to see where mouse probes go missing or get duplicated."""
        if stage != '' and self.debug:
            LOGGER.info(f"[{stage}]")
        for subset in self.subsets:
            if getattr(self, subset).index.duplicated().sum() > 0:
                if self.debug:
                    LOGGER.info(f"[ {getattr(self, subset).index.duplicated().sum()} duplicate probes on SigSet.{subset} ]")
            if getattr(self, subset).shape[0] != self.starting_probe_counts[subset]:
                if self.debug:
                    count_lost = self.starting_probe_counts[subset] - getattr(self, subset).shape[0]
                    LOGGER.info(f"[ {count_lost} probes lost from SigSet.{subset} ]")