# Lib
import logging
from pathlib import Path
from urllib.parse import urljoin
import numpy as np
import pandas as pd
# App
from ..models import ArrayType, Channel, ProbeType
from ..utils import (
download_file,
get_file_object,
inner_join_data,
is_file_like,
reset_file,
)
# Public API of this module.
__all__ = ['Manifest']
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)
# Name of the directory that holds downloaded manifest files.
MANIFEST_DIR_NAME = '.methylprep_manifest_files'
# Default location of that directory; '~' is expanded by Manifest.download_default().
MANIFEST_DIR_PATH = f'~/{MANIFEST_DIR_NAME}'
# Location used instead when Manifest is created with on_lambda=True.
MANIFEST_DIR_PATH_LAMBDA = f'/tmp/{MANIFEST_DIR_NAME}'
# Bucket name, and the base URL that manifest file names are joined onto for download.
MANIFEST_BUCKET_NAME = 'array-manifest-files'
MANIFEST_REMOTE_PATH = f'https://s3.amazonaws.com/{MANIFEST_BUCKET_NAME}/'
# Manifest file name for each supported array, keyed by its short string name.
#
# NOTE: key order matters. Manifest.__init__ pairs these keys positionally with
# the keys of ARRAY_TYPE_MANIFEST_FILENAMES, so both dicts must list the arrays
# in the same order.
#
# BE SURE TO ALSO UPDATE arrays.py ArrayType.num_controls if updating a manifest here.
#
# The commented-out name after each entry is an alternate manifest file name
# kept for reference.
ARRAY_FILENAME = {
    '27k': 'hm27.hg19.manifest.csv.gz',
    # 'humanmethylation27_270596_v1-2.csv.gz'

    '450k': 'HumanMethylation450k_15017482_v3.csv.gz',
    # 'HumanMethylation450_15017482_v1-2.CoreColumns.csv.gz'

    'epic': 'HumanMethylationEPIC_manifest_v2.csv.gz',
    # 'MethylationEPIC_v-1-0_B4.CoreColumns.csv.gz'

    'epic+': 'CombinedManifestEPIC_manifest_CoreColumns_v2.csv.gz',
    # 'CombinedManifestEPIC.manifest.CoreColumns.csv.gz'

    'mouse': 'MM285_manifest_v3.csv.gz',
    # 'MM285_mm39_manifest_v2.csv.gz'
}
# Manifest file name for each ArrayType member.
#
# Entries are listed in the same order as ARRAY_FILENAME because
# Manifest.__init__ zips the keys of the two dicts together to translate
# short string names into ArrayType members.
ARRAY_TYPE_MANIFEST_FILENAMES = {
    array_type: ARRAY_FILENAME[short_name]
    for array_type, short_name in (
        (ArrayType.ILLUMINA_27K, '27k'),
        (ArrayType.ILLUMINA_450K, '450k'),
        (ArrayType.ILLUMINA_EPIC, 'epic'),
        (ArrayType.ILLUMINA_EPIC_PLUS, 'epic+'),
        (ArrayType.ILLUMINA_MOUSE, 'mouse'),
    )
}
# Probe-identity columns shared by every manifest layout.
_PROBE_ID_COLUMNS = (
    'IlmnID',
    'AddressA_ID',
    'AddressB_ID',
    'Infinium_Design_Type',
    'Color_Channel',
)

# Genome-mapping columns. Manifest.read_probes() treats the OLD_* columns as
# optional and retries without them when they are missing.
_GENOME_MAPPING_COLUMNS = (
    'Genome_Build',
    'CHR',
    'MAPINFO',
    'Strand',
    'OLD_Genome_Build',
    'OLD_CHR',
    'OLD_MAPINFO',
    'OLD_Strand',
)

# Columns read from the probe section of a non-mouse manifest.
MANIFEST_COLUMNS = _PROBE_ID_COLUMNS + _GENOME_MAPPING_COLUMNS

# Columns read from the probe section of the mouse manifest: the standard
# columns with 'design' inserted after 'Color_Channel'.
# 'design' replaces Probe_Type in v1.4.6+ and carries a lot of design metadata;
# only the 'Random' and 'Multi' values matter in code. Before v1.4.6,
# 'Probe_Type' was needed to identify mouse-specific probes (mu) and control
# probe sub_types.
MOUSE_MANIFEST_COLUMNS = _PROBE_ID_COLUMNS + ('design',) + _GENOME_MAPPING_COLUMNS

# Names given to the columns of the control-probe rows.
# Control probes don't have 'IlmnID' values set -- these probes are not locus
# specific. These column names don't appear in the manifest; they are added
# when importing the control section of rows.
CONTROL_COLUMNS = (
    'Address_ID',
    'Control_Type',
    'Color',
    'Extended_Type',
)
class Manifest():
    """Provides an object interface to an Illumina array manifest file.

    On construction the manifest is opened once and parsed into separate data
    frames for the regular probes, the control probes, the SNP ('rs') probes
    and, for the mouse array only, the mouse-specific probes.

    Arguments:
        array_type {ArrayType} -- The type of array to process.
            values are styled like ArrayType.ILLUMINA_27K, ArrayType.ILLUMINA_EPIC or ArrayType('epic'), ArrayType('mouse').
            One of the short string keys of ARRAY_FILENAME (e.g. 'epic') is also accepted.

    Keyword Arguments:
        filepath_or_buffer {file-like} -- a pre-existing manifest filepath (default: {None}).
            When None, the default manifest for `array_type` is downloaded (or reused if already present).
        on_lambda {bool} -- store/look up the manifest under /tmp instead of the home directory (default: {False})
        verbose {bool} -- log the manifest name when reading it (default: {True})

    Raises:
        KeyError: no default manifest is registered for `array_type` (only when no file is supplied).
        EOFError: no header row starting with 'IlmnID' could be found in the manifest.
    """

    __genome_df = None
    __probe_type_subsets = None  # apparently not used anywhere in methylprep

    def __init__(self, array_type, filepath_or_buffer=None, on_lambda=False, verbose=True):
        # Translate short string names ('27k', 'epic', ...) into ArrayType members.
        # This pairs the keys of the two module-level dicts positionally, so it
        # relies on both dicts listing the arrays in the same order.
        array_str_to_class = dict(zip(ARRAY_FILENAME, ARRAY_TYPE_MANIFEST_FILENAMES))
        if array_type in array_str_to_class:
            array_type = array_str_to_class[array_type]
        self.array_type = array_type
        self.on_lambda = on_lambda  # changes filepath to /tmp for the read-only file system
        self.verbose = verbose

        if filepath_or_buffer is None:
            filepath_or_buffer = self.download_default(array_type, self.on_lambda)

        # All sections are parsed from the same open handle; each reader
        # repositions the handle itself as needed.
        with get_file_object(filepath_or_buffer) as manifest_file:
            self.__data_frame = self.read_probes(manifest_file)
            self.__control_data_frame = self.read_control_probes(manifest_file)
            self.__snp_data_frame = self.read_snp_probes(manifest_file)
            if self.array_type == ArrayType.ILLUMINA_MOUSE:
                self.__mouse_data_frame = self.read_mouse_probes(manifest_file)
            else:
                self.__mouse_data_frame = pd.DataFrame()

    @property
    def columns(self):
        """Manifest columns to load for this array type (the mouse array has its own set)."""
        if self.array_type == ArrayType.ILLUMINA_MOUSE:
            return MOUSE_MANIFEST_COLUMNS
        return MANIFEST_COLUMNS

    @property
    def data_frame(self):
        """Probe data frame built by read_probes()."""
        return self.__data_frame

    @property
    def control_data_frame(self):
        """Control-probe data frame built by read_control_probes()."""
        return self.__control_data_frame

    @property
    def snp_data_frame(self):
        """SNP ('rs') probe data frame built by read_snp_probes()."""
        return self.__snp_data_frame

    @property
    def mouse_data_frame(self):
        """Mouse-specific probe data frame; an empty DataFrame for non-mouse arrays."""
        return self.__mouse_data_frame

    @staticmethod
    def download_default(array_type, on_lambda=False):
        """Downloads the appropriate manifest file if one does not already exist.

        Arguments:
            array_type {ArrayType} -- The type of array to process.

        Keyword Arguments:
            on_lambda {bool} -- use the /tmp based manifest directory (default: {False})

        Returns:
            [PurePath] -- Path to the manifest file.

        Raises:
            KeyError: `array_type` has no entry in ARRAY_TYPE_MANIFEST_FILENAMES.
        """
        # Only expand the directory that is actually going to be used.
        dir_path = Path(MANIFEST_DIR_PATH_LAMBDA if on_lambda else MANIFEST_DIR_PATH).expanduser()
        filename = ARRAY_TYPE_MANIFEST_FILENAMES[array_type]
        filepath = dir_path.joinpath(filename)
        if filepath.exists():
            return filepath

        LOGGER.info(f"Downloading manifest: {Path(filename).stem}")
        src_url = urljoin(MANIFEST_REMOTE_PATH, filename)
        download_file(filename, src_url, dir_path)
        return filepath

    @staticmethod
    def seek_to_start(manifest_file):
        """ find the start of the data part of the manifest. first left-most column must be "IlmnID" to be found.

        The file is rewound and read line by line until a line starting with
        b'IlmnID' is found; the handle is then repositioned at that header line
        so a following read starts there.

        Raises:
            EOFError: the end of the file was reached without finding the header row.
        """
        reset_file(manifest_file)
        # current_pos always holds the offset at which `header_line` started.
        current_pos = manifest_file.tell()
        header_line = manifest_file.readline()
        while not header_line.startswith(b'IlmnID'):
            current_pos = manifest_file.tell()
            if not header_line:  # EOF
                raise EOFError("The first (left-most) column in your manifest must contain 'IlmnID'. This defines the header row.")
            header_line = manifest_file.readline()

        if current_pos == 0:
            manifest_file.seek(current_pos)
        else:
            # Lands one byte before the header line, i.e. on the last byte of
            # the preceding line.
            # NOTE(review): presumably the row offsets used by the readers
            # account for this extra line ending -- confirm before changing.
            manifest_file.seek(current_pos - 1)

    @staticmethod
    def _manifest_label(manifest_file):
        """Return a short name for `manifest_file` for use in log messages.

        This is the stem of the handle's `name` attribute when it has one.
        In-memory buffers have no `name`, and some handles expose a
        non-path name (e.g. an integer file descriptor); neither case
        should make logging fail.
        """
        name = getattr(manifest_file, 'name', None)
        if name is None:
            return '<unnamed manifest>'
        try:
            return Path(name).stem
        except TypeError:
            return str(name)

    def _read_probe_table(self, manifest_file, use_columns):
        """Parse the probe rows from the current position of `manifest_file`, loading only `use_columns`."""
        return pd.read_csv(
            manifest_file,
            comment='[',
            dtype=self.get_data_types(),
            usecols=use_columns,
            nrows=self.array_type.num_probes,
            index_col='IlmnID',
        )

    def read_probes(self, manifest_file):
        """Read the probe section of the manifest into a data frame indexed by IlmnID.

        The columns in `self.columns` are loaded. If that raises ValueError,
        the read is retried without the optional OLD_* genome mapping columns.
        A 'probe_type' column is added, derived from each probe's name and
        Infinium design type.

        Arguments:
            manifest_file {file-like} -- open manifest; must support tell() and seek().

        Returns:
            [DataFrame] -- one row per probe.
        """
        label = self._manifest_label(manifest_file)
        if self.verbose:
            LOGGER.info(f'Reading manifest file: {label}')

        # Remember where parsing starts so the fallback can begin from the same
        # place: a failed read_csv may already have consumed part of the stream.
        start_pos = manifest_file.tell()
        try:
            data_frame = self._read_probe_table(manifest_file, self.columns)
        except ValueError:
            optional = ['OLD_CHR', 'OLD_Strand', 'OLD_Genome_Build', 'OLD_MAPINFO']
            use_columns = [col for col in self.columns if col not in optional]
            manifest_file.seek(start_pos)
            data_frame = self._read_probe_table(manifest_file, use_columns)
            LOGGER.info(f"Some optional genome mapping columns were not found in {label}")

        # AddressB_ID in manifest includes NaNs and INTs and becomes floats, which breaks. forcing back here.
        #data_frame['AddressB_ID'] = data_frame['AddressB_ID'].astype('Int64') # converts floats to ints; leaves NaNs inplace
        # TURNS out, int or float both work for manifests. NOT the source of the error with mouse.

        def get_probe_type(name, infinium_type):
            """returns one of (I, II, SnpI, SnpII, Control)

            .from_manifest_values() returns probe type using either the Infinium_Design_Type (I or II) or the name
            (starts with 'rs' == SnpI) and 'Control' is none of the above."""
            probe_type = ProbeType.from_manifest_values(name, infinium_type)
            return probe_type.value

        vectorized_get_type = np.vectorize(get_probe_type)
        data_frame['probe_type'] = vectorized_get_type(
            data_frame.index.values,
            data_frame['Infinium_Design_Type'].values,
        )
        return data_frame

    def read_control_probes(self, manifest_file):
        """ Unlike other probes, control probes have no IlmnID because they're not locus-specific.
        they also use arbitrary columns, ignoring the header at start of manifest file.

        The rows following the probe section are read and given the
        CONTROL_COLUMNS names, with the first column as the index.
        """
        #LOGGER.info(f'Reading control probes: {Path(manifest_file.name).stem}')
        self.seek_to_start(manifest_file)
        return pd.read_csv(
            manifest_file,
            comment='[',
            header=None,
            index_col=0,  # illumina_id, not IlmnID here
            names=CONTROL_COLUMNS,  # this gives these columns new names, because they have none. loading stuff at end of CSV after probes end.
            nrows=self.array_type.num_controls,
            skiprows=self.array_type.num_probes + 1,  # without the +1, it includes the last cpg probe in controls and breaks stuff.
            usecols=range(len(CONTROL_COLUMNS)),
        )

    def read_snp_probes(self, manifest_file):
        """ Unlike cpg and control probes, these rs probes are NOT sequential in all arrays.

        The whole manifest is loaded and filtered down to rows whose IlmnID
        starts with 'rs'.
        """
        #LOGGER.info(f'Reading snp probes: {Path(manifest_file.name).stem} --> {snp_df.shape[0]} found')
        self.seek_to_start(manifest_file)
        # since these are not sequential, loading everything and filtering by IlmnID.
        snp_df = pd.read_csv(manifest_file, low_memory=False)
        is_snp = snp_df['IlmnID'].str.match('rs', na=False)
        # 'O' type columns won't match in SigSet, so forcing float64 here. Also, float32 won't cover all probe IDs; must be float64.
        return snp_df[is_snp].astype({'AddressA_ID': 'float64', 'AddressB_ID': 'float64'})

    def read_mouse_probes(self, manifest_file):
        """ ILLUMINA_MOUSE contains unique probes whose names begin with 'mu' and 'rp'
        for 'murine' and 'repeat', respectively. This creates a dataframe of these probes,
        which are not processed like normal cg/ch probes.

        Rows are selected by their 'design' value ('Multi' or 'Random').
        """
        self.seek_to_start(manifest_file)
        # low_memory=False is required because control probes create mixed-types in columns.
        mouse_df = pd.read_csv(manifest_file, low_memory=False)
        #--- pre v1.4.6: mouse_df = mouse_df[(mouse_df['Probe_Type'] == 'rp') | (mouse_df['IlmnID'].str.startswith('uk', na=False)) | (mouse_df['Probe_Type'] == 'mu')]
        #--- pre v1.4.6: 'mu' probes start with 'cg' instead and have 'mu' in Probe_Type column
        mouse_df = mouse_df[(mouse_df['design'] == 'Multi') | (mouse_df['design'] == 'Random')]
        return mouse_df

    # NEVER CALLED ANYWHERE - belongs in methylize
    # def map_to_genome(self, data_frame):
    #     genome_df = self.get_genome_data()
    #     merged_df = inner_join_data(data_frame, genome_df)
    #     return merged_df
    #
    # def get_genome_data(self, build=None):
    #     if self.__genome_df is not None:
    #         return self.__genome_df
    #     LOGGER.info('Building genome data frame')
    #     # new in version 1.5.6: support for both new and old genomes
    #     GENOME_COLUMNS = (
    #         'Genome_Build',
    #         'CHR',
    #         'MAPINFO',
    #         'Strand',
    #     ) # PLUS four more optional columns with OLD_ prefix (for prev genome build)
    #     genome_columns = GENOME_COLUMNS + ['OLD_'+col for col in GENOME_COLUMNS]
    #     self.__genome_df = self.data_frame[genome_columns]
    #     return self.__genome_df

    def get_data_types(self):
        """Return the column -> dtype mapping used when parsing the probe section.

        Every column in `self.columns` is read as str, except the two address
        columns, which use the nullable 'Int64' dtype.
        """
        data_types = dict.fromkeys(self.columns, str)
        data_types['AddressA_ID'] = 'Int64'  # 'float64' -- dtype found only in pandas 0.24 or greater
        data_types['AddressB_ID'] = 'Int64'  # 'float64'
        return data_types

    def get_probe_details(self, probe_type, channel=None):
        """used by infer_channel_switch. Given a probe type (I, II, SnpI, SnpII, Control) and a channel (Channel.RED | Channel.GREEN),
        this will return info needed to map probes to their names (e.g. cg0031313 or rs00542420), which are NOT in the idat files.

        Arguments:
            probe_type {ProbeType} -- probe type to select.

        Keyword Arguments:
            channel {Channel} -- when given, additionally restrict to probes whose Color_Channel matches (default: {None})

        Returns:
            [DataFrame] -- the matching rows of `data_frame`.

        Raises:
            TypeError: `probe_type` is not a ProbeType, or `channel` is given but is not a Channel.
        """
        if not isinstance(probe_type, ProbeType):
            raise TypeError('probe_type is not a valid ProbeType')
        if channel and not isinstance(channel, Channel):
            raise TypeError('channel not a valid Channel')

        data_frame = self.data_frame
        probe_type_mask = data_frame['probe_type'].values == probe_type.value
        if not channel:
            return data_frame[probe_type_mask]

        channel_mask = data_frame['Color_Channel'].values == channel.value
        return data_frame[probe_type_mask & channel_mask]