Source code for methylprep.processing.preprocess

# Normal-exponential using out-of-band probes
# normex: negative control probes
# noob: ‘out-of-band’ Infinium I probes

# Lib
import logging
import numpy as np
import pandas as pd
from statsmodels import robust
from scipy.stats import norm, lognorm
# App
from ..models import ControlType, ArrayType
from ..models.sketchy_probes import qualityMask450, qualityMaskEPIC, qualityMaskEPICPLUS, qualityMaskmouse


__all__ = ['preprocess_noob']


LOGGER = logging.getLogger(__name__)


[docs]def preprocess_noob(container, offset=15, pval_probes_df=None, quality_mask_df=None, nonlinear_dye_correction=True, debug=False, unit_test_oob=False): # v1.4.5+ """ NOOB pythonized copy of https://github.com/zwdzwd/sesame/blob/master/R/background_correction.R - The function takes a SigSet and returns a modified SigSet with the background subtracted. - Background is modelled in a normal distribution and true signal in an exponential distribution. - The Norm-Exp deconvolution is parameterized using Out-Of-Band (oob) probes. - includes snps, but not control probes yet - output should replace the container instead of returning debug dataframes - II RED and II GREEN both have data, but manifest doesn't have a way to track this, so function tracks it. - keep IlmnID as index for meth/unmeth snps, and convert fg_green if nonlinear_dye_correction=True, this uses a sesame method in place of minfi method, in a later step. if unit_test_oob==True, returns the intermediate data instead of updating the SigSet/SampleDataContainer. """ if debug: print(f"DEBUG NOOB {debug} nonlinear_dye_correction={nonlinear_dye_correction}, pval_probes_df={pval_probes_df.shape if isinstance(pval_probes_df,pd.DataFrame) else 'None'}, quality_mask_df={quality_mask_df.shape if isinstance(quality_mask_df,pd.DataFrame) else 'None'}") # stack- need one long list of values, regardless of Meth/Uneth ibG = pd.concat([ container.ibG.reset_index().rename(columns={'Meth': 'mean_value'}).assign(used='M'), container.ibG.reset_index().rename(columns={'Unmeth': 'mean_value'}).assign(used='U') ]) ibG = ibG[ ~ibG['mean_value'].isna() ].drop(columns=['Meth','Unmeth']) ibR = pd.concat([ container.ibR.reset_index().rename(columns={'Meth': 'mean_value'}).assign(used='M'), #.drop(columns=['Meth','Unmeth']), container.ibR.reset_index().rename(columns={'Unmeth': 'mean_value'}).assign(used='U') #.drop(columns=['Meth','Unmeth']) ]) ibR = ibR[ ~ibR['mean_value'].isna() ].drop(columns=['Meth','Unmeth']) # out-of-band is Green-Unmeth and Red-Meth # exclude failing probes pval = pval_probes_df.loc[ pval_probes_df['poobah_pval'] > container.poobah_sig ].index if isinstance(pval_probes_df, pd.DataFrame) else [] qmask = quality_mask_df.loc[ quality_mask_df['quality_mask'] == 0 ].index if isinstance(quality_mask_df, pd.DataFrame) else [] # the ignored errors here should only be from probes that are both pval failures and qmask failures. Rmeth = list(container.oobR['Meth'].drop(index=pval, errors='ignore').drop(index=qmask, errors='ignore')) Runmeth = list(container.oobR['Unmeth'].drop(index=pval, errors='ignore').drop(index=qmask, errors='ignore')) oobR = pd.DataFrame( Rmeth + Runmeth, columns=['mean_value']) Gmeth = list(container.oobG['Meth'].drop(index=pval, errors='ignore').drop(index=qmask, errors='ignore')) Gunmeth = list(container.oobG['Unmeth'].drop(index=pval, errors='ignore').drop(index=qmask, errors='ignore')) oobG = pd.DataFrame( Gmeth + Gunmeth, columns=['mean_value']) # minfi test # ref fg_green = 442614 | vs ibG 442672 = 396374 + 46240 # ref fg_red = 528410 | vs ibR 528482 = 439279 + 89131 # ref oob_green = 178374 # ref oob_red = 92578 #oobR = pd.DataFrame( data={'mean_value': container.oobR['Meth']}) #oobG = pd.DataFrame( data={'mean_value': container.oobG['Unmeth']}) #print(f" oobR {oobR.shape} oobG {oobG.shape}") #import pdb;pdb.set_trace() debug_warnings = "" if oobR['mean_value'].isna().sum() > 0: debug_warnings += f" NOOB: oobG had {oobG['mean_value'].isna().sum()} NaNs" oobR = oobR.dropna() if oobG['mean_value'].isna().sum() > 0: debug_warnings += f" NOOB: oobG had {oobG['mean_value'].isna().sum()} NaNs" oobG = oobG.dropna() if ibG['mean_value'].isna().sum() > 0 or ibR['mean_value'].isna().sum() > 0: raise ValueError("ibG or ibR is missing probe intensities. need to filter them out.") if debug: print(f"ibG {len(ibG)} ibR {len(ibR)} oobG {len(oobG)} oobR {len(oobR)} | {debug_warnings}") # set minimum intensity to 1 ibG_affected = len(ibG.loc[ ibG['mean_value'] < 1 ].index) ibR_affected = len(ibR.loc[ ibR['mean_value'] < 1 ].index) ibG.loc[ ibG['mean_value'] < 1, 'mean_value'] = 1 ibR.loc[ ibR['mean_value'] < 1, 'mean_value'] = 1 oobG_affected = len(oobG[ oobG['mean_value'] < 1]) oobR_affected = len(oobR[ oobR['mean_value'] < 1]) oobG.loc[ oobG.mean_value < 1, 'mean_value'] = 1 oobR.loc[ oobR.mean_value < 1, 'mean_value'] = 1 if debug: if ibR_affected > 0 or ibR_affected > 0: print(f"ib: Set {ibR_affected} red and {ibG_affected} green to 1.0 ({len(ibR[ ibR['mean_value'] == 1 ].index)}, {len(ibG[ ibG['mean_value'] == 1 ].index)})") if oobG_affected > 0 or oobR_affected > 0: print(f"oob: Set {oobR_affected} red and {oobG_affected} green to 1.0 ({len(oobR[ oobR['mean_value'] == 1 ].index)}, {len(oobG[ oobG['mean_value'] == 1 ].index)})") # do background correction in each channel; returns "normalized in-band signal" ibG_nl, params_green = normexp_bg_corrected(ibG, oobG, offset, sample_name=container.sample.name) ibR_nl, params_red = normexp_bg_corrected(ibR, oobR, offset, sample_name=container.sample.name) noob_green = ibG_nl.round({'bg_corrected':0}) noob_red = ibR_nl.round({'bg_corrected':0}) if unit_test_oob: return { 'oobR': oobR, 'oobG': oobG, 'noob_green': noob_green, 'noob_red': noob_red, } # by default, this last step is omitted for sesame if nonlinear_dye_correction == True: # update() expects noob_red/green to have IlmnIDs in index, and contain bg_corrected for ALL probes. container.update_probe_means(noob_green, noob_red) elif nonlinear_dye_correction == False: # this "linear" method may be anologous to the ratio quantile normalization described in Nature: https://www.nature.com/articles/s41598-020-72664-6 normexp_bg_correct_control(container.ctrl_green, params_green) normexp_bg_correct_control(container.ctrl_red, params_red) mask_green = container.ctrl_green['Control_Type'].isin(ControlType.normalization_green()) mask_red = container.ctrl_red['Control_Type'].isin(ControlType.normalization_red()) avg_green = container.ctrl_green[mask_green]['bg_corrected'].mean() avg_red = container.ctrl_red[mask_red]['bg_corrected'].mean() rg_ratios = avg_red / avg_green red_factor = 1 / rg_ratios container.update_probe_means(noob_green, noob_red, red_factor) container._SigSet__minfi_noob = True elif nonlinear_dye_correction is None: if debug: LOGGER.info("skipping linear/nonlinear dye-bias correction step") # skips the minfi-linear step and won't trigger the sesame nonlinear dye bias step downstream, if you REALLY want it uncorrected. Mostly for debugging / benchmarking. container.update_probe_means(noob_green, noob_red)
class BackgroundCorrectionParams(): """ used in apply_bg_correction """ __slots__ = ( 'bg_mean', 'bg_mad', 'mean_signal', 'offset', ) def __init__(self, bg_mean, bg_mad, mean_signal, offset): # note: default offset was 15. In v1.3.3 (Jan 2020) I kept 15, after finding this made results match sesame's NOOB output exactly, if dye step ommitted. # offset is specified in the preprocess_noob() function. self.bg_mean = bg_mean self.bg_mad = bg_mad self.mean_signal = mean_signal self.offset = offset def normexp_bg_corrected(fg_probes, ctrl_probes, offset, sample_name=None): """ analogous to sesame's backgroundCorrectionNoobCh1 """ fg_means = fg_probes['mean_value'] if fg_means.min() == fg_means.max(): LOGGER.error(f"{sample_name}: min and max intensity are same. Sample probably bad.") params = BackgroundCorrectionParams(bg_mean=1.0, bg_mad=1.0, mean_signal=1.0, offset=15) fg_probes['bg_corrected'] = 1.0 return fg_probes, params fg_mean, _fg_mad = huber(fg_means) bg_mean, bg_mad = huber(ctrl_probes['mean_value']) mean_signal = np.maximum(fg_mean - bg_mean, 10) # "alpha" in sesame function params = BackgroundCorrectionParams(bg_mean, bg_mad, mean_signal, offset) corrected_signals = apply_bg_correction(fg_means, params) fg_probes['bg_corrected'] = corrected_signals fg_probes['bg_corrected'] = fg_probes['bg_corrected'].round(1) return fg_probes, params def normexp_bg_correct_control(control_probes, params): """Function for getting xcs controls for preprocessNoob""" control_means = control_probes['mean_value'] corrected_signals = apply_bg_correction(control_means, params) control_probes['bg_corrected'] = corrected_signals return control_probes def apply_bg_correction(mean_values, params): """ this function won't work with float16 in practice (underflow). limits use to float32 """ if not isinstance(params, BackgroundCorrectionParams): raise ValueError('params is not a BackgroundCorrectionParams instance') np.seterr(under='ignore') # 'raise to explore fixing underflow warning here' bg_mean = params.bg_mean #mu bg_mad = params.bg_mad #sigma mean_signal = params.mean_signal #alpha offset = params.offset mu_sf = mean_values - bg_mean - (bg_mad ** 2) / mean_signal #try: # signal_part_one = mu_sf + (bg_mad ** 2) # signal_part_two = np.exp(norm(mu_sf, bg_mad).logpdf(0) - norm(mu_sf, bg_mad).logsf(0)) # signal = signal_part_one * signal_part_two #except: # print(signal_part_one, norm(mu_sf, bg_mad).logpdf(0), norm(mu_sf, bg_mad).logsf(0)) # norm is from scipy.stats signal = mu_sf + (bg_mad ** 2) * np.exp(norm(mu_sf, bg_mad).logpdf(0) - norm(mu_sf, bg_mad).logsf(0)) """ COMPARE with sesame: signal <- mu.sf + sigma2 * exp( dnorm(0, mean = mu.sf, sd = sigma, log = TRUE) - pnorm( 0, mean = mu.sf, sd = sigma, lower.tail = FALSE, log.p = TRUE)) """ # sesame: "Limit of numerical accuracy reached with very low intensity or very high background: # setting adjusted intensities to small value" signal = np.maximum(signal, 1e-6) true_signal = signal + offset return true_signal def huber(vector): """Huber function. Designed to mirror MASS huber function in R Parameters ---------- vector: list list of float values Returns ------- local_median: float calculated mu value mad_scale: float calculated s value """ num_values = len(vector) positive_factor = 1.5 convergence_tol = 1.0e-6 mad_scale = robust.mad(vector) local_median = np.median(vector) init_local_median = local_median if not (local_median or mad_scale): return local_median, mad_scale while True: yy = np.minimum( np.maximum( local_median - positive_factor * mad_scale, vector, ), local_median + positive_factor * mad_scale, ) init_local_median = sum(yy) / num_values if abs(local_median - init_local_median) < convergence_tol * mad_scale: return local_median, mad_scale local_median = init_local_median def _apply_sesame_quality_mask(data_container): """ adapted from sesame's qualityMask function, which is applied just after poobah to remove probes Wanding thinks are sketchy. OUTPUT: this pandas DataFrame will have NaNs for probes to be excluded and 0.0 for probes to be retained. NaNs converted to 1.0 in final processing output. SESAME: masked <- sesameDataGet(paste0(sset@platform, '.probeInfo'))$mask to use TCGA masking, only applies to HM450 """ if data_container.array_type not in ( # ArrayType.ILLUMINA_27K, ArrayType.ILLUMINA_450K, ArrayType.ILLUMINA_EPIC, ArrayType.ILLUMINA_EPIC_PLUS, ArrayType.ILLUMINA_MOUSE): LOGGER.info(f"Quality masking is not supported for {data_container.array_type}.") return # load set of probes to remove from local file if data_container.array_type == ArrayType.ILLUMINA_450K: probes = qualityMask450 elif data_container.array_type == ArrayType.ILLUMINA_EPIC: probes = qualityMaskEPIC elif data_container.array_type == ArrayType.ILLUMINA_EPIC_PLUS: # this is a bit of a hack; probe names don't match epic, so I'm temporarily renaming, then filtering, then reverting. probes = qualityMaskEPICPLUS elif data_container.array_type == ArrayType.ILLUMINA_MOUSE: probes = qualityMaskmouse # v1.6+: the 1.0s are good probes and the 0.0 are probes to be excluded. cgs = pd.DataFrame( np.zeros((len(data_container.man.index), 1)), index=data_container.man.index, columns=['quality_mask']) cgs['quality_mask'] = 1.0 snps = pd.DataFrame( np.zeros((len(data_container.snp_man.index), 1)), index=data_container.snp_man.index, columns=['quality_mask']) snps['quality_mask'] = 1.0 df = pd.concat([cgs, snps]) df.loc[df.index.isin(probes), 'quality_mask'] = 0 #LOGGER.info(f"DEBUG quality_mask: {df.shape}, {df['quality_mask'].value_counts()} from {probes.shape} probes") return df """ ##### DEPRECATED (<v1.5.0) ##### def _old_reprocess_noob_sesame_v144(container, offset=15, debug=False): ''' NOOB pythonized copy of https://github.com/zwdzwd/sesame/blob/master/R/background_correction.R - The function takes a SigSet and returns a modified SigSet with that background subtracted. - Background is modelled in a normal distribution and true signal in an exponential distribution. - The Norm-Exp deconvolution is parameterized using Out-Of-Band (oob) probes. - includes snps, but not control probes yet - output should replace the container instead of returning debug dataframes - II RED and II GREEN both have data, but manifest doesn't have a way to track this, so function tracks it. ''' # get in-band red and green channel probe means #ibR <- c(IR(sset), II(sset)[,'U']) # in-band red signal = IR_meth + IR_unmeth + II[unmeth] #ibG <- c(IG(sset), II(sset)[,'M']) # in-band green signal = IG_meth + IG_unmeth + II[meth] # cols: mean_value, IlmnID, probe_type (I,II); index: illumina_id #CHECKED: AddressA or AddressB for each probe subtype matches probes.py raw = container.snp_methylated.data_frame snp_IR_meth = (raw[(raw['Infinium_Design_Type'] == 'I') & (raw['Color_Channel'] == 'Red')][['mean_value','AddressB_ID']] .reset_index().rename(columns={'AddressB_ID':'illumina_id'}).set_index('illumina_id')) snp_IR_meth['Channel'] = 'Red' snp_IG_meth = (raw[(raw['Infinium_Design_Type'] == 'I') & (raw['Color_Channel'] == 'Grn')][['mean_value','AddressB_ID']] .reset_index().rename(columns={'AddressB_ID':'illumina_id'}).set_index('illumina_id')) snp_IG_meth['Channel'] = 'Grn' snp_II_meth = (raw[(raw['Infinium_Design_Type'] == 'II')][['mean_value','AddressA_ID']] .reset_index().rename(columns={'AddressA_ID':'illumina_id'}).set_index('illumina_id')) snp_II_meth['Channel'] = 'Grn' raw = container.snp_unmethylated.data_frame snp_IR_unmeth = (raw[(raw['Infinium_Design_Type'] == 'I') & (raw['Color_Channel'] == 'Red')][['mean_value','AddressA_ID']] .reset_index().rename(columns={'AddressA_ID':'illumina_id'}).set_index('illumina_id')) snp_IR_unmeth['Channel'] = 'Red' snp_IG_unmeth = (raw[(raw['Infinium_Design_Type'] == 'I') & (raw['Color_Channel'] == 'Grn')][['mean_value','AddressA_ID']] .reset_index().rename(columns={'AddressA_ID':'illumina_id'}).set_index('illumina_id')) snp_IG_unmeth['Channel'] = 'Grn' snp_II_unmeth = (raw[(raw['Infinium_Design_Type'] == 'II')][['mean_value','AddressA_ID']] .reset_index().rename(columns={'AddressA_ID':'illumina_id'}).set_index('illumina_id')) snp_II_unmeth['Channel'] = 'Red' if debug: print('snp probes:', snp_IR_meth.shape, snp_IG_unmeth.shape, snp_II_meth.shape, snp_II_unmeth.shape) #--> copy over snps, but first get snps with illumina_id in index # swap index on all snps from IlmnID to illumina_id ## note: 350076 II + 89203 IR + 46298 IG = 485577 (including rs probes, but excl controls) ibG = container.fg_green # --> self.raw_dataset.get_fg_values(self.manifest, Channel.GREEN) ibG['Channel'] = 'Grn' ibG.index.name = 'illumina_id' ibR = container.fg_red # --> self.raw_dataset.get_fg_values(self.manifest, Channel.RED) ibR['Channel'] = 'Red' ibR.index.name = 'illumina_id' # to match sesame, extra probes are IR_unmeth and IG_unmeth in ibR red and ibG green, respectively. ibG = pd.concat([ibG, snp_IG_meth, snp_IG_unmeth, snp_II_meth ], sort=True).drop('probe_type', axis=1) # sort=True, because column order varies ibR = pd.concat([ibR, snp_IR_meth, snp_IR_unmeth, snp_II_unmeth ], sort=True).drop('probe_type', axis=1) if debug: print('in-bound Green:', ibG.shape) # green IG is AddressB, (meth) according to PROBE_SUBSETS print('in-bound Red:', ibR.shape) # red IR is AddressA (unmeth) according to PROBE_SUBSETS ### at this point, ibG ibR probe counts match sesame EXACTLY # set minimum intensity to 1 ibR_affected = len(ibR.loc[ ibR['mean_value'] < 1 ].index) ibG_affected = len(ibG.loc[ ibG['mean_value'] < 1 ].index) ibR.loc[ ibR['mean_value'] < 1, 'mean_value'] = 1 ibG.loc[ ibG['mean_value'] < 1, 'mean_value'] = 1 if debug: print(f"IB: Set {ibR_affected} red and {ibG_affected} green to 1.0 ({len(ibR[ ibR['mean_value'] == 1 ].index)}, {len(ibG[ ibG['mean_value'] == 1 ].index)})") red_dupes = len(ibR.index)-len(ibR.drop_duplicates().index) grn_dupes = len(ibG.index)-len(ibG.drop_duplicates().index) if debug and (red_dupes or grn_dupes): print(f"duplicate probes: {red_dupes} red and {grn_dupes} green") ref = container.manifest.data_frame # [['Infinium_Design_Type','Color_Channel']] # using a copy .oobG and .oobR here; does not update the idat or other source data probe_means # adopted from raw_dataset.filter_oob_probes here oobR = (container.oobR.merge(container.manifest.data_frame[['AddressB_ID']], how='left', left_index=True, right_index=True) .reset_index() .rename(columns={'AddressB_ID':'illumina_id', 'Unnamed: 0': 'IlmnID'}) .set_index('illumina_id') ) oobR = pd.DataFrame(list(oobR['meth']) + list(oobR['unmeth']), columns=['mean_value']) oobG = (container.oobG.merge(container.manifest.data_frame[['AddressA_ID']], how='left', left_index=True, right_index=True) .reset_index() .rename(columns={'AddressA_ID':'illumina_id', 'Unnamed: 0': 'IlmnID'}) .set_index('illumina_id') ) oobG = pd.DataFrame(list(oobG['meth']) + list(oobG['unmeth']), columns=['mean_value']) oobG_affected = len(oobG[ oobG['mean_value'] < 1]) oobG.loc[ oobG.mean_value < 1, 'mean_value'] = 1 oobR_affected = len(oobR[ oobR['mean_value'] < 1]) oobR.loc[ oobR.mean_value < 1, 'mean_value'] = 1 # here: do bg_subtract AND normalization step here ... ## do background correction in each channel; returns "normalized in-band signal" ibR_nl, params_red = normexp_bg_corrected(ibR, oobR, offset, sample_name=container.sample.name) #<- .backgroundCorrectionNoobCh1(ibR, oobR(sset), ctl(sset)$R, getBackgroundR(sset, bgR), offset=offset) ibG_nl, params_green = normexp_bg_corrected(ibG, oobG, offset, sample_name=container.sample.name) # <- .backgroundCorrectionNoobCh1(ibG, oobG(sset), ctl(sset)$G, getBackgroundG(sset, bgG), offset=offset) ibG_nl = ibG_nl.round({'bg_corrected':0}) ibR_nl = ibR_nl.round({'bg_corrected':0}) #print('ibG_nl', ibG_nl.shape) #print('ibR_nl', ibR_nl.shape) noob_green = ibG_nl noob_red = ibR_nl if debug: print(f"OOB: Set {oobR_affected} red and {oobG_affected} green to 1.0; shapes: {oobG.shape}, {oobR.shape}") print(f"noob_red with Grn: {len(noob_red[noob_red['Channel'] == 'Grn'])} noob_green with Red: {len(noob_green[noob_green['Channel'] == 'Red'])}") ref_IG = ref[(ref['Color_Channel']=='Grn') & (ref['Infinium_Design_Type']=='I')] ref_IR = ref[(ref['Color_Channel']=='Red') & (ref['Infinium_Design_Type']=='I')] ref_II = ref[ref['Infinium_Design_Type']=='II'] # II channel is NaN, but BOTH channels have data print(f"from manifest: ref_IG {ref_IG.shape} ref_IR {ref_IR.shape} ref_II {ref_II.shape}") # Combine and return red (IG + IR + II_unmeth) and green (IG + IR + II_meth) # ibR_nl has IlmnID and illumina_id (index); ref has IlmnID as index # ref_meth/ref_unmeth from probes.py ref_meth = pd.concat([ ref[(ref['Color_Channel'].isna()) & (ref['Infinium_Design_Type']=='II')]['AddressA_ID'].reset_index().rename(columns={'AddressA_ID':'illumina_id'}), ref[(ref['Color_Channel']=='Grn') & (ref['Infinium_Design_Type']== 'I')]['AddressB_ID'].reset_index().rename(columns={'AddressB_ID':'illumina_id'}), ref[(ref['Color_Channel']=='Red') & (ref['Infinium_Design_Type']== 'I')]['AddressB_ID'].reset_index().rename(columns={'AddressB_ID':'illumina_id'}), ]) #.set_index('illumina_id') # .drop('illumina_id', axis=1) ref_unmeth = pd.concat([ ref[(ref['Color_Channel'].isna()) & (ref['Infinium_Design_Type']=='II')]['AddressA_ID'].reset_index().rename(columns={'AddressA_ID':'illumina_id'}), ref[(ref['Color_Channel']=='Grn') & (ref['Infinium_Design_Type']== 'I')]['AddressA_ID'].reset_index().rename(columns={'AddressA_ID':'illumina_id'}), ref[(ref['Color_Channel']=='Red') & (ref['Infinium_Design_Type']== 'I')]['AddressA_ID'].reset_index().rename(columns={'AddressA_ID':'illumina_id'}), ]) #.set_index('illumina_id') # .drop('illumina_id', axis=1) noob_meth_G = noob_green[noob_green.index.isin(ref_meth['illumina_id'])] noob_unmeth_G = noob_green[noob_green.index.isin(ref_unmeth['illumina_id'])] noob_meth_R = noob_red[noob_red.index.isin(ref_meth['illumina_id'])] noob_unmeth_R = noob_red[noob_red.index.isin(ref_unmeth['illumina_id'])] noob_meth_dupes = pd.concat([noob_meth_G, noob_meth_R]) noob_unmeth_dupes = pd.concat([noob_unmeth_G, noob_unmeth_R]) # CONFIRMED: this dedupe method below matches sesame's output exactly for noob_meth noob_meth = (noob_meth_dupes[~noob_meth_dupes.index.duplicated(keep='first')] .set_index('IlmnID') .sort_index() .rename(columns={'bg_corrected':'meth'}) ) # conveniently, the FIRST value of each duplicate probe appears to be the one we want for both meth/unmeth R/G channels noob_unmeth = (noob_unmeth_dupes[~noob_unmeth_dupes.index.duplicated(keep='first')] .set_index('IlmnID') .sort_index() .rename(columns={'bg_corrected':'unmeth'}) ) # update II, IG, IR, oobR, oobG, ctrl_red, ctrl_green # --> --> probes.py subsets concatenate these: # fg_green # GREEN + AddressA + II # GREEN + AddressA + IG # GREEN + AddressB + IG # oob_green # RED + AddressA + IR # fg_red # RED + AddressA + II # RED + AddressA + IR # RED + AddressB + IR # oob_red # GREEN + AddressB + IG # # methylated # GREEN + AddressA + II # GREEN + AddressB + I # RED + AddressB + I # unmethylated # RED + AddressA + II # GREEN + AddressA + I # RED + AddressA + I # RETROFITTING BELOW -- may not work, as sesame works with noob_meth / noob_unmeth instead try: container.methylated.set_bg_corrected(noob_green, noob_red) container.unmethylated.set_bg_corrected(noob_green, noob_red) container.methylated.set_noob(1.0) container.unmethylated.set_noob(1.0) except ValueError as e: print(e) if debug: LOGGER.warning("could not update container methylated / unmethylated noob values, because preprocess_sesame_noob has already run once.") # output df should have sample meth or unmeth in a column with sample name and IlmnID as index. 485512 rows if debug: return { 'noob_meth': noob_meth, 'noob_unmeth': noob_unmeth, 'oobR': oobR, 'oobG': oobG, 'noob_green': noob_green, 'noob_red': noob_red, 'dupe_meth': noob_meth_dupes, 'dupe_unmeth': noob_unmeth_dupes, } return # noob_meth, noob_unmeth """