# Normal-exponential using out-of-band probes
# normex: negative control probes
# noob: ‘out-of-band’ Infinium I probes
# Lib
import logging
import numpy as np
import pandas as pd
from statsmodels import robust
from scipy.stats import norm, lognorm
# App
from ..models import ControlType, ArrayType
from ..models.sketchy_probes import qualityMask450, qualityMaskEPIC, qualityMaskEPICPLUS, qualityMaskmouse
__all__ = ['preprocess_noob']
LOGGER = logging.getLogger(__name__)
[docs]def preprocess_noob(container, offset=15, pval_probes_df=None, quality_mask_df=None, nonlinear_dye_correction=True, debug=False, unit_test_oob=False): # v1.4.5+
""" NOOB pythonized copy of https://github.com/zwdzwd/sesame/blob/master/R/background_correction.R
- The function takes a SigSet and returns a modified SigSet with the background subtracted.
- Background is modelled in a normal distribution and true signal in an exponential distribution.
- The Norm-Exp deconvolution is parameterized using Out-Of-Band (oob) probes.
- includes snps, but not control probes yet
- output should replace the container instead of returning debug dataframes
- II RED and II GREEN both have data, but manifest doesn't have a way to track this, so function tracks it.
- keep IlmnID as index for meth/unmeth snps, and convert fg_green
if nonlinear_dye_correction=True, this uses a sesame method in place of minfi method, in a later step.
if unit_test_oob==True, returns the intermediate data instead of updating the SigSet/SampleDataContainer.
"""
if debug:
print(f"DEBUG NOOB {debug} nonlinear_dye_correction={nonlinear_dye_correction}, pval_probes_df={pval_probes_df.shape if isinstance(pval_probes_df,pd.DataFrame) else 'None'}, quality_mask_df={quality_mask_df.shape if isinstance(quality_mask_df,pd.DataFrame) else 'None'}")
# stack- need one long list of values, regardless of Meth/Uneth
ibG = pd.concat([
container.ibG.reset_index().rename(columns={'Meth': 'mean_value'}).assign(used='M'),
container.ibG.reset_index().rename(columns={'Unmeth': 'mean_value'}).assign(used='U')
])
ibG = ibG[ ~ibG['mean_value'].isna() ].drop(columns=['Meth','Unmeth'])
ibR = pd.concat([
container.ibR.reset_index().rename(columns={'Meth': 'mean_value'}).assign(used='M'), #.drop(columns=['Meth','Unmeth']),
container.ibR.reset_index().rename(columns={'Unmeth': 'mean_value'}).assign(used='U') #.drop(columns=['Meth','Unmeth'])
])
ibR = ibR[ ~ibR['mean_value'].isna() ].drop(columns=['Meth','Unmeth'])
# out-of-band is Green-Unmeth and Red-Meth
# exclude failing probes
pval = pval_probes_df.loc[ pval_probes_df['poobah_pval'] > container.poobah_sig ].index if isinstance(pval_probes_df, pd.DataFrame) else []
qmask = quality_mask_df.loc[ quality_mask_df['quality_mask'] == 0 ].index if isinstance(quality_mask_df, pd.DataFrame) else []
# the ignored errors here should only be from probes that are both pval failures and qmask failures.
Rmeth = list(container.oobR['Meth'].drop(index=pval, errors='ignore').drop(index=qmask, errors='ignore'))
Runmeth = list(container.oobR['Unmeth'].drop(index=pval, errors='ignore').drop(index=qmask, errors='ignore'))
oobR = pd.DataFrame( Rmeth + Runmeth, columns=['mean_value'])
Gmeth = list(container.oobG['Meth'].drop(index=pval, errors='ignore').drop(index=qmask, errors='ignore'))
Gunmeth = list(container.oobG['Unmeth'].drop(index=pval, errors='ignore').drop(index=qmask, errors='ignore'))
oobG = pd.DataFrame( Gmeth + Gunmeth, columns=['mean_value'])
# minfi test
# ref fg_green = 442614 | vs ibG 442672 = 396374 + 46240
# ref fg_red = 528410 | vs ibR 528482 = 439279 + 89131
# ref oob_green = 178374
# ref oob_red = 92578
#oobR = pd.DataFrame( data={'mean_value': container.oobR['Meth']})
#oobG = pd.DataFrame( data={'mean_value': container.oobG['Unmeth']})
#print(f" oobR {oobR.shape} oobG {oobG.shape}")
#import pdb;pdb.set_trace()
debug_warnings = ""
if oobR['mean_value'].isna().sum() > 0:
debug_warnings += f" NOOB: oobG had {oobG['mean_value'].isna().sum()} NaNs"
oobR = oobR.dropna()
if oobG['mean_value'].isna().sum() > 0:
debug_warnings += f" NOOB: oobG had {oobG['mean_value'].isna().sum()} NaNs"
oobG = oobG.dropna()
if ibG['mean_value'].isna().sum() > 0 or ibR['mean_value'].isna().sum() > 0:
raise ValueError("ibG or ibR is missing probe intensities. need to filter them out.")
if debug:
print(f"ibG {len(ibG)} ibR {len(ibR)} oobG {len(oobG)} oobR {len(oobR)} | {debug_warnings}")
# set minimum intensity to 1
ibG_affected = len(ibG.loc[ ibG['mean_value'] < 1 ].index)
ibR_affected = len(ibR.loc[ ibR['mean_value'] < 1 ].index)
ibG.loc[ ibG['mean_value'] < 1, 'mean_value'] = 1
ibR.loc[ ibR['mean_value'] < 1, 'mean_value'] = 1
oobG_affected = len(oobG[ oobG['mean_value'] < 1])
oobR_affected = len(oobR[ oobR['mean_value'] < 1])
oobG.loc[ oobG.mean_value < 1, 'mean_value'] = 1
oobR.loc[ oobR.mean_value < 1, 'mean_value'] = 1
if debug:
if ibR_affected > 0 or ibR_affected > 0:
print(f"ib: Set {ibR_affected} red and {ibG_affected} green to 1.0 ({len(ibR[ ibR['mean_value'] == 1 ].index)}, {len(ibG[ ibG['mean_value'] == 1 ].index)})")
if oobG_affected > 0 or oobR_affected > 0:
print(f"oob: Set {oobR_affected} red and {oobG_affected} green to 1.0 ({len(oobR[ oobR['mean_value'] == 1 ].index)}, {len(oobG[ oobG['mean_value'] == 1 ].index)})")
# do background correction in each channel; returns "normalized in-band signal"
ibG_nl, params_green = normexp_bg_corrected(ibG, oobG, offset, sample_name=container.sample.name)
ibR_nl, params_red = normexp_bg_corrected(ibR, oobR, offset, sample_name=container.sample.name)
noob_green = ibG_nl.round({'bg_corrected':0})
noob_red = ibR_nl.round({'bg_corrected':0})
if unit_test_oob:
return {
'oobR': oobR,
'oobG': oobG,
'noob_green': noob_green,
'noob_red': noob_red,
}
# by default, this last step is omitted for sesame
if nonlinear_dye_correction == True:
# update() expects noob_red/green to have IlmnIDs in index, and contain bg_corrected for ALL probes.
container.update_probe_means(noob_green, noob_red)
elif nonlinear_dye_correction == False:
# this "linear" method may be anologous to the ratio quantile normalization described in Nature: https://www.nature.com/articles/s41598-020-72664-6
normexp_bg_correct_control(container.ctrl_green, params_green)
normexp_bg_correct_control(container.ctrl_red, params_red)
mask_green = container.ctrl_green['Control_Type'].isin(ControlType.normalization_green())
mask_red = container.ctrl_red['Control_Type'].isin(ControlType.normalization_red())
avg_green = container.ctrl_green[mask_green]['bg_corrected'].mean()
avg_red = container.ctrl_red[mask_red]['bg_corrected'].mean()
rg_ratios = avg_red / avg_green
red_factor = 1 / rg_ratios
container.update_probe_means(noob_green, noob_red, red_factor)
container._SigSet__minfi_noob = True
elif nonlinear_dye_correction is None:
if debug:
LOGGER.info("skipping linear/nonlinear dye-bias correction step")
# skips the minfi-linear step and won't trigger the sesame nonlinear dye bias step downstream, if you REALLY want it uncorrected. Mostly for debugging / benchmarking.
container.update_probe_means(noob_green, noob_red)
class BackgroundCorrectionParams():
""" used in apply_bg_correction """
__slots__ = (
'bg_mean',
'bg_mad',
'mean_signal',
'offset',
)
def __init__(self, bg_mean, bg_mad, mean_signal, offset):
# note: default offset was 15. In v1.3.3 (Jan 2020) I kept 15, after finding this made results match sesame's NOOB output exactly, if dye step ommitted.
# offset is specified in the preprocess_noob() function.
self.bg_mean = bg_mean
self.bg_mad = bg_mad
self.mean_signal = mean_signal
self.offset = offset
def normexp_bg_corrected(fg_probes, ctrl_probes, offset, sample_name=None):
""" analogous to sesame's backgroundCorrectionNoobCh1 """
fg_means = fg_probes['mean_value']
if fg_means.min() == fg_means.max():
LOGGER.error(f"{sample_name}: min and max intensity are same. Sample probably bad.")
params = BackgroundCorrectionParams(bg_mean=1.0, bg_mad=1.0, mean_signal=1.0, offset=15)
fg_probes['bg_corrected'] = 1.0
return fg_probes, params
fg_mean, _fg_mad = huber(fg_means)
bg_mean, bg_mad = huber(ctrl_probes['mean_value'])
mean_signal = np.maximum(fg_mean - bg_mean, 10) # "alpha" in sesame function
params = BackgroundCorrectionParams(bg_mean, bg_mad, mean_signal, offset)
corrected_signals = apply_bg_correction(fg_means, params)
fg_probes['bg_corrected'] = corrected_signals
fg_probes['bg_corrected'] = fg_probes['bg_corrected'].round(1)
return fg_probes, params
def normexp_bg_correct_control(control_probes, params):
"""Function for getting xcs controls for preprocessNoob"""
control_means = control_probes['mean_value']
corrected_signals = apply_bg_correction(control_means, params)
control_probes['bg_corrected'] = corrected_signals
return control_probes
def apply_bg_correction(mean_values, params):
""" this function won't work with float16 in practice (underflow). limits use to float32 """
if not isinstance(params, BackgroundCorrectionParams):
raise ValueError('params is not a BackgroundCorrectionParams instance')
np.seterr(under='ignore') # 'raise to explore fixing underflow warning here'
bg_mean = params.bg_mean #mu
bg_mad = params.bg_mad #sigma
mean_signal = params.mean_signal #alpha
offset = params.offset
mu_sf = mean_values - bg_mean - (bg_mad ** 2) / mean_signal
#try:
# signal_part_one = mu_sf + (bg_mad ** 2)
# signal_part_two = np.exp(norm(mu_sf, bg_mad).logpdf(0) - norm(mu_sf, bg_mad).logsf(0))
# signal = signal_part_one * signal_part_two
#except:
# print(signal_part_one, norm(mu_sf, bg_mad).logpdf(0), norm(mu_sf, bg_mad).logsf(0))
# norm is from scipy.stats
signal = mu_sf + (bg_mad ** 2) * np.exp(norm(mu_sf, bg_mad).logpdf(0) - norm(mu_sf, bg_mad).logsf(0))
""" COMPARE with sesame:
signal <- mu.sf + sigma2 * exp(
dnorm(0, mean = mu.sf, sd = sigma, log = TRUE) -
pnorm(
0, mean = mu.sf, sd = sigma,
lower.tail = FALSE, log.p = TRUE))
"""
# sesame: "Limit of numerical accuracy reached with very low intensity or very high background:
# setting adjusted intensities to small value"
signal = np.maximum(signal, 1e-6)
true_signal = signal + offset
return true_signal
def huber(vector):
"""Huber function. Designed to mirror MASS huber function in R
Parameters
----------
vector: list
list of float values
Returns
-------
local_median: float
calculated mu value
mad_scale: float
calculated s value
"""
num_values = len(vector)
positive_factor = 1.5
convergence_tol = 1.0e-6
mad_scale = robust.mad(vector)
local_median = np.median(vector)
init_local_median = local_median
if not (local_median or mad_scale):
return local_median, mad_scale
while True:
yy = np.minimum(
np.maximum(
local_median - positive_factor * mad_scale,
vector,
),
local_median + positive_factor * mad_scale,
)
init_local_median = sum(yy) / num_values
if abs(local_median - init_local_median) < convergence_tol * mad_scale:
return local_median, mad_scale
local_median = init_local_median
def _apply_sesame_quality_mask(data_container):
""" adapted from sesame's qualityMask function, which is applied just after poobah
to remove probes Wanding thinks are sketchy.
OUTPUT: this pandas DataFrame will have NaNs for probes to be excluded and 0.0 for probes to be retained. NaNs converted to 1.0 in final processing output.
SESAME:
masked <- sesameDataGet(paste0(sset@platform, '.probeInfo'))$mask
to use TCGA masking, only applies to HM450
"""
if data_container.array_type not in (
# ArrayType.ILLUMINA_27K,
ArrayType.ILLUMINA_450K,
ArrayType.ILLUMINA_EPIC,
ArrayType.ILLUMINA_EPIC_PLUS,
ArrayType.ILLUMINA_MOUSE):
LOGGER.info(f"Quality masking is not supported for {data_container.array_type}.")
return
# load set of probes to remove from local file
if data_container.array_type == ArrayType.ILLUMINA_450K:
probes = qualityMask450
elif data_container.array_type == ArrayType.ILLUMINA_EPIC:
probes = qualityMaskEPIC
elif data_container.array_type == ArrayType.ILLUMINA_EPIC_PLUS:
# this is a bit of a hack; probe names don't match epic, so I'm temporarily renaming, then filtering, then reverting.
probes = qualityMaskEPICPLUS
elif data_container.array_type == ArrayType.ILLUMINA_MOUSE:
probes = qualityMaskmouse
# v1.6+: the 1.0s are good probes and the 0.0 are probes to be excluded.
cgs = pd.DataFrame( np.zeros((len(data_container.man.index), 1)), index=data_container.man.index, columns=['quality_mask'])
cgs['quality_mask'] = 1.0
snps = pd.DataFrame( np.zeros((len(data_container.snp_man.index), 1)), index=data_container.snp_man.index, columns=['quality_mask'])
snps['quality_mask'] = 1.0
df = pd.concat([cgs, snps])
df.loc[df.index.isin(probes), 'quality_mask'] = 0
#LOGGER.info(f"DEBUG quality_mask: {df.shape}, {df['quality_mask'].value_counts()} from {probes.shape} probes")
return df
""" ##### DEPRECATED (<v1.5.0) #####
def _old_reprocess_noob_sesame_v144(container, offset=15, debug=False):
''' NOOB pythonized copy of https://github.com/zwdzwd/sesame/blob/master/R/background_correction.R
- The function takes a SigSet and returns a modified SigSet with that background subtracted.
- Background is modelled in a normal distribution and true signal in an exponential distribution.
- The Norm-Exp deconvolution is parameterized using Out-Of-Band (oob) probes.
- includes snps, but not control probes yet
- output should replace the container instead of returning debug dataframes
- II RED and II GREEN both have data, but manifest doesn't have a way to track this, so function tracks it.
'''
# get in-band red and green channel probe means
#ibR <- c(IR(sset), II(sset)[,'U']) # in-band red signal = IR_meth + IR_unmeth + II[unmeth]
#ibG <- c(IG(sset), II(sset)[,'M']) # in-band green signal = IG_meth + IG_unmeth + II[meth]
# cols: mean_value, IlmnID, probe_type (I,II); index: illumina_id
#CHECKED: AddressA or AddressB for each probe subtype matches probes.py
raw = container.snp_methylated.data_frame
snp_IR_meth = (raw[(raw['Infinium_Design_Type'] == 'I') & (raw['Color_Channel'] == 'Red')][['mean_value','AddressB_ID']]
.reset_index().rename(columns={'AddressB_ID':'illumina_id'}).set_index('illumina_id'))
snp_IR_meth['Channel'] = 'Red'
snp_IG_meth = (raw[(raw['Infinium_Design_Type'] == 'I') & (raw['Color_Channel'] == 'Grn')][['mean_value','AddressB_ID']]
.reset_index().rename(columns={'AddressB_ID':'illumina_id'}).set_index('illumina_id'))
snp_IG_meth['Channel'] = 'Grn'
snp_II_meth = (raw[(raw['Infinium_Design_Type'] == 'II')][['mean_value','AddressA_ID']]
.reset_index().rename(columns={'AddressA_ID':'illumina_id'}).set_index('illumina_id'))
snp_II_meth['Channel'] = 'Grn'
raw = container.snp_unmethylated.data_frame
snp_IR_unmeth = (raw[(raw['Infinium_Design_Type'] == 'I') & (raw['Color_Channel'] == 'Red')][['mean_value','AddressA_ID']]
.reset_index().rename(columns={'AddressA_ID':'illumina_id'}).set_index('illumina_id'))
snp_IR_unmeth['Channel'] = 'Red'
snp_IG_unmeth = (raw[(raw['Infinium_Design_Type'] == 'I') & (raw['Color_Channel'] == 'Grn')][['mean_value','AddressA_ID']]
.reset_index().rename(columns={'AddressA_ID':'illumina_id'}).set_index('illumina_id'))
snp_IG_unmeth['Channel'] = 'Grn'
snp_II_unmeth = (raw[(raw['Infinium_Design_Type'] == 'II')][['mean_value','AddressA_ID']]
.reset_index().rename(columns={'AddressA_ID':'illumina_id'}).set_index('illumina_id'))
snp_II_unmeth['Channel'] = 'Red'
if debug:
print('snp probes:', snp_IR_meth.shape, snp_IG_unmeth.shape, snp_II_meth.shape, snp_II_unmeth.shape)
#--> copy over snps, but first get snps with illumina_id in index
# swap index on all snps from IlmnID to illumina_id
## note: 350076 II + 89203 IR + 46298 IG = 485577 (including rs probes, but excl controls)
ibG = container.fg_green # --> self.raw_dataset.get_fg_values(self.manifest, Channel.GREEN)
ibG['Channel'] = 'Grn'
ibG.index.name = 'illumina_id'
ibR = container.fg_red # --> self.raw_dataset.get_fg_values(self.manifest, Channel.RED)
ibR['Channel'] = 'Red'
ibR.index.name = 'illumina_id'
# to match sesame, extra probes are IR_unmeth and IG_unmeth in ibR red and ibG green, respectively.
ibG = pd.concat([ibG,
snp_IG_meth,
snp_IG_unmeth,
snp_II_meth
], sort=True).drop('probe_type', axis=1)
# sort=True, because column order varies
ibR = pd.concat([ibR,
snp_IR_meth,
snp_IR_unmeth,
snp_II_unmeth
], sort=True).drop('probe_type', axis=1)
if debug:
print('in-bound Green:', ibG.shape) # green IG is AddressB, (meth) according to PROBE_SUBSETS
print('in-bound Red:', ibR.shape) # red IR is AddressA (unmeth) according to PROBE_SUBSETS
### at this point, ibG ibR probe counts match sesame EXACTLY
# set minimum intensity to 1
ibR_affected = len(ibR.loc[ ibR['mean_value'] < 1 ].index)
ibG_affected = len(ibG.loc[ ibG['mean_value'] < 1 ].index)
ibR.loc[ ibR['mean_value'] < 1, 'mean_value'] = 1
ibG.loc[ ibG['mean_value'] < 1, 'mean_value'] = 1
if debug:
print(f"IB: Set {ibR_affected} red and {ibG_affected} green to 1.0 ({len(ibR[ ibR['mean_value'] == 1 ].index)}, {len(ibG[ ibG['mean_value'] == 1 ].index)})")
red_dupes = len(ibR.index)-len(ibR.drop_duplicates().index)
grn_dupes = len(ibG.index)-len(ibG.drop_duplicates().index)
if debug and (red_dupes or grn_dupes):
print(f"duplicate probes: {red_dupes} red and {grn_dupes} green")
ref = container.manifest.data_frame # [['Infinium_Design_Type','Color_Channel']]
# using a copy .oobG and .oobR here; does not update the idat or other source data probe_means
# adopted from raw_dataset.filter_oob_probes here
oobR = (container.oobR.merge(container.manifest.data_frame[['AddressB_ID']],
how='left',
left_index=True,
right_index=True)
.reset_index()
.rename(columns={'AddressB_ID':'illumina_id', 'Unnamed: 0': 'IlmnID'})
.set_index('illumina_id')
)
oobR = pd.DataFrame(list(oobR['meth']) + list(oobR['unmeth']), columns=['mean_value'])
oobG = (container.oobG.merge(container.manifest.data_frame[['AddressA_ID']],
how='left',
left_index=True,
right_index=True)
.reset_index()
.rename(columns={'AddressA_ID':'illumina_id', 'Unnamed: 0': 'IlmnID'})
.set_index('illumina_id')
)
oobG = pd.DataFrame(list(oobG['meth']) + list(oobG['unmeth']), columns=['mean_value'])
oobG_affected = len(oobG[ oobG['mean_value'] < 1])
oobG.loc[ oobG.mean_value < 1, 'mean_value'] = 1
oobR_affected = len(oobR[ oobR['mean_value'] < 1])
oobR.loc[ oobR.mean_value < 1, 'mean_value'] = 1
# here: do bg_subtract AND normalization step here ...
## do background correction in each channel; returns "normalized in-band signal"
ibR_nl, params_red = normexp_bg_corrected(ibR, oobR, offset, sample_name=container.sample.name)
#<- .backgroundCorrectionNoobCh1(ibR, oobR(sset), ctl(sset)$R, getBackgroundR(sset, bgR), offset=offset)
ibG_nl, params_green = normexp_bg_corrected(ibG, oobG, offset, sample_name=container.sample.name)
# <- .backgroundCorrectionNoobCh1(ibG, oobG(sset), ctl(sset)$G, getBackgroundG(sset, bgG), offset=offset)
ibG_nl = ibG_nl.round({'bg_corrected':0})
ibR_nl = ibR_nl.round({'bg_corrected':0})
#print('ibG_nl', ibG_nl.shape)
#print('ibR_nl', ibR_nl.shape)
noob_green = ibG_nl
noob_red = ibR_nl
if debug:
print(f"OOB: Set {oobR_affected} red and {oobG_affected} green to 1.0; shapes: {oobG.shape}, {oobR.shape}")
print(f"noob_red with Grn: {len(noob_red[noob_red['Channel'] == 'Grn'])} noob_green with Red: {len(noob_green[noob_green['Channel'] == 'Red'])}")
ref_IG = ref[(ref['Color_Channel']=='Grn') & (ref['Infinium_Design_Type']=='I')]
ref_IR = ref[(ref['Color_Channel']=='Red') & (ref['Infinium_Design_Type']=='I')]
ref_II = ref[ref['Infinium_Design_Type']=='II'] # II channel is NaN, but BOTH channels have data
print(f"from manifest: ref_IG {ref_IG.shape} ref_IR {ref_IR.shape} ref_II {ref_II.shape}")
# Combine and return red (IG + IR + II_unmeth) and green (IG + IR + II_meth)
# ibR_nl has IlmnID and illumina_id (index); ref has IlmnID as index
# ref_meth/ref_unmeth from probes.py
ref_meth = pd.concat([
ref[(ref['Color_Channel'].isna()) & (ref['Infinium_Design_Type']=='II')]['AddressA_ID'].reset_index().rename(columns={'AddressA_ID':'illumina_id'}),
ref[(ref['Color_Channel']=='Grn') & (ref['Infinium_Design_Type']== 'I')]['AddressB_ID'].reset_index().rename(columns={'AddressB_ID':'illumina_id'}),
ref[(ref['Color_Channel']=='Red') & (ref['Infinium_Design_Type']== 'I')]['AddressB_ID'].reset_index().rename(columns={'AddressB_ID':'illumina_id'}),
]) #.set_index('illumina_id') # .drop('illumina_id', axis=1)
ref_unmeth = pd.concat([
ref[(ref['Color_Channel'].isna()) & (ref['Infinium_Design_Type']=='II')]['AddressA_ID'].reset_index().rename(columns={'AddressA_ID':'illumina_id'}),
ref[(ref['Color_Channel']=='Grn') & (ref['Infinium_Design_Type']== 'I')]['AddressA_ID'].reset_index().rename(columns={'AddressA_ID':'illumina_id'}),
ref[(ref['Color_Channel']=='Red') & (ref['Infinium_Design_Type']== 'I')]['AddressA_ID'].reset_index().rename(columns={'AddressA_ID':'illumina_id'}),
]) #.set_index('illumina_id') # .drop('illumina_id', axis=1)
noob_meth_G = noob_green[noob_green.index.isin(ref_meth['illumina_id'])]
noob_unmeth_G = noob_green[noob_green.index.isin(ref_unmeth['illumina_id'])]
noob_meth_R = noob_red[noob_red.index.isin(ref_meth['illumina_id'])]
noob_unmeth_R = noob_red[noob_red.index.isin(ref_unmeth['illumina_id'])]
noob_meth_dupes = pd.concat([noob_meth_G, noob_meth_R])
noob_unmeth_dupes = pd.concat([noob_unmeth_G, noob_unmeth_R])
# CONFIRMED: this dedupe method below matches sesame's output exactly for noob_meth
noob_meth = (noob_meth_dupes[~noob_meth_dupes.index.duplicated(keep='first')]
.set_index('IlmnID')
.sort_index()
.rename(columns={'bg_corrected':'meth'})
)
# conveniently, the FIRST value of each duplicate probe appears to be the one we want for both meth/unmeth R/G channels
noob_unmeth = (noob_unmeth_dupes[~noob_unmeth_dupes.index.duplicated(keep='first')]
.set_index('IlmnID')
.sort_index()
.rename(columns={'bg_corrected':'unmeth'})
)
# update II, IG, IR, oobR, oobG, ctrl_red, ctrl_green
# --> --> probes.py subsets concatenate these:
# fg_green
# GREEN + AddressA + II
# GREEN + AddressA + IG
# GREEN + AddressB + IG
# oob_green
# RED + AddressA + IR
# fg_red
# RED + AddressA + II
# RED + AddressA + IR
# RED + AddressB + IR
# oob_red
# GREEN + AddressB + IG
#
# methylated
# GREEN + AddressA + II
# GREEN + AddressB + I
# RED + AddressB + I
# unmethylated
# RED + AddressA + II
# GREEN + AddressA + I
# RED + AddressA + I
# RETROFITTING BELOW -- may not work, as sesame works with noob_meth / noob_unmeth instead
try:
container.methylated.set_bg_corrected(noob_green, noob_red)
container.unmethylated.set_bg_corrected(noob_green, noob_red)
container.methylated.set_noob(1.0)
container.unmethylated.set_noob(1.0)
except ValueError as e:
print(e)
if debug:
LOGGER.warning("could not update container methylated / unmethylated noob values, because preprocess_sesame_noob has already run once.")
# output df should have sample meth or unmeth in a column with sample name and IlmnID as index. 485512 rows
if debug:
return {
'noob_meth': noob_meth,
'noob_unmeth': noob_unmeth,
'oobR': oobR,
'oobG': oobG,
'noob_green': noob_green,
'noob_red': noob_red,
'dupe_meth': noob_meth_dupes,
'dupe_unmeth': noob_unmeth_dupes,
}
return # noob_meth, noob_unmeth
"""