# Source code for methylprep.files.manifests

# Lib
import logging
from pathlib import Path
from urllib.parse import urljoin
import numpy as np
import pandas as pd
# App
from ..models import ArrayType, Channel, ProbeType
from ..utils import (
    download_file,
    get_file_object,
    inner_join_data,
    is_file_like,
    reset_file,
)


__all__ = ['Manifest']


# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)

# Folder name used for locally stored manifest files, and the two locations it
# can live in: under the user's home directory, or under /tmp.
MANIFEST_DIR_NAME = '.methylprep_manifest_files'
MANIFEST_DIR_PATH = '~/' + MANIFEST_DIR_NAME
MANIFEST_DIR_PATH_LAMBDA = '/tmp/' + MANIFEST_DIR_NAME

# Remote bucket name and the base URL built from it.
MANIFEST_BUCKET_NAME = 'array-manifest-files'
MANIFEST_REMOTE_PATH = 'https://s3.amazonaws.com/{}/'.format(MANIFEST_BUCKET_NAME)

# Manifest file name for each array short name. The commented-out name after
# each entry is an alternative file name kept for reference (presumably the
# one that the active entry superseded -- confirm before relying on it).
ARRAY_FILENAME = {
    '27k': 'hm27.hg19.manifest.csv.gz',  # 'humanmethylation27_270596_v1-2.csv.gz'
    '450k': 'HumanMethylation450k_15017482_v3.csv.gz',  # 'HumanMethylation450_15017482_v1-2.CoreColumns.csv.gz'
    'epic': 'HumanMethylationEPIC_manifest_v2.csv.gz',  # 'MethylationEPIC_v-1-0_B4.CoreColumns.csv.gz'
    'epic+': 'CombinedManifestEPIC_manifest_CoreColumns_v2.csv.gz',  # 'CombinedManifestEPIC.manifest.CoreColumns.csv.gz'
    'mouse': 'MM285_manifest_v3.csv.gz',  # 'MM285_mm39_manifest_v2.csv.gz'
    ###### BE SURE TO ALSO UPDATE arrays.py ArrayType.num_controls if updating a manifest here. #######
}
# The same file names keyed by ArrayType member. Entries are listed in the
# same order as ARRAY_FILENAME, so the two dicts line up key-for-key.
ARRAY_TYPE_MANIFEST_FILENAMES = {
    array_type: ARRAY_FILENAME[short_name]
    for short_name, array_type in (
        ('27k', ArrayType.ILLUMINA_27K),
        ('450k', ArrayType.ILLUMINA_450K),
        ('epic', ArrayType.ILLUMINA_EPIC),
        ('epic+', ArrayType.ILLUMINA_EPIC_PLUS),
        ('mouse', ArrayType.ILLUMINA_MOUSE),
    )
}
# Columns loaded from the probe section of a (non-mouse) manifest.
MANIFEST_COLUMNS = (
    # probe identity and design
    'IlmnID', 'AddressA_ID', 'AddressB_ID', 'Infinium_Design_Type', 'Color_Channel',
    # genome mapping
    'Genome_Build', 'CHR', 'MAPINFO', 'Strand',
    # genome mapping, OLD_-prefixed variants
    'OLD_Genome_Build', 'OLD_CHR', 'OLD_MAPINFO', 'OLD_Strand',
)

# Mouse manifests use the same columns plus 'design', placed right after 'Color_Channel'.
# 'design' replaces Probe_Type in v1.4.6+ with tons of design meta data; only 'Random'
# and 'Multi' matter in code. (Pre v1.4.6, 'Probe_Type' was needed to identify
# mouse-specific probes (mu) and control probe sub_types.)
MOUSE_MANIFEST_COLUMNS = MANIFEST_COLUMNS[:5] + ('design',) + MANIFEST_COLUMNS[5:]

# Names given to the control-probe columns. Control probes don't have 'IlmnID' values
# set -- these probes are not loci specific. These column names don't appear in the
# manifest; they are added when importing the control section of rows.
CONTROL_COLUMNS = ('Address_ID', 'Control_Type', 'Color', 'Extended_Type')


class Manifest():
    """Provides an object interface to an Illumina array manifest file.

    On construction the manifest is read and split into separate data frames:
    the regular probes, the control probes, the SNP ('rs') probes and, for
    mouse arrays only, the probes whose 'design' is 'Multi' or 'Random'.
    When no file is supplied, the default manifest for the array type is
    located (and downloaded if it is not already present locally).

    Arguments:
        array_type {ArrayType} -- The type of array to process.
        values are styled like ArrayType.ILLUMINA_27K, ArrayType.ILLUMINA_EPIC or ArrayType('epic'), ArrayType('mouse').
        The short string keys of ARRAY_FILENAME (e.g. 'epic') are also accepted and converted.

    Keyword Arguments:
        filepath_or_buffer {file-like} -- a pre-existing manifest filepath (default: {None})
        on_lambda {bool} -- use the /tmp manifest folder instead of the one in the
            user's home directory, for read-only file systems (default: {False})
        verbose {bool} -- log the name of the manifest being read (default: {True})

    Raises:
        EOFError: no line of the manifest starts with 'IlmnID' (the header row).
        KeyError: no file was supplied and array_type has no default manifest file name.
    """

    __genome_df = None
    __probe_type_subsets = None # apparently not used anywhere in methylprep

    def __init__(self, array_type, filepath_or_buffer=None, on_lambda=False, verbose=True):
        # ARRAY_FILENAME and ARRAY_TYPE_MANIFEST_FILENAMES are declared in the same order,
        # so pairing their keys maps each short name ('epic', ...) to its ArrayType.
        array_str_to_class = dict(zip(ARRAY_FILENAME, ARRAY_TYPE_MANIFEST_FILENAMES))
        if array_type in array_str_to_class:
            array_type = array_str_to_class[array_type]
        self.array_type = array_type
        self.on_lambda = on_lambda # changes filepath to /tmp for the read-only file system
        self.verbose = verbose

        if filepath_or_buffer is None:
            filepath_or_buffer = self.download_default(array_type, self.on_lambda)

        # Every reader below rewinds the same handle before parsing its own section.
        with get_file_object(filepath_or_buffer) as manifest_file:
            self.__data_frame = self.read_probes(manifest_file)
            self.__control_data_frame = self.read_control_probes(manifest_file)
            self.__snp_data_frame = self.read_snp_probes(manifest_file)
            if self.array_type == ArrayType.ILLUMINA_MOUSE:
                self.__mouse_data_frame = self.read_mouse_probes(manifest_file)
            else:
                self.__mouse_data_frame = pd.DataFrame()

    @property
    def columns(self):
        """Column names to load for this array type (mouse arrays add 'design')."""
        if self.array_type == ArrayType.ILLUMINA_MOUSE:
            return MOUSE_MANIFEST_COLUMNS
        return MANIFEST_COLUMNS

    @property
    def data_frame(self):
        """Probe data frame built by read_probes (indexed by IlmnID)."""
        return self.__data_frame

    @property
    def control_data_frame(self):
        """Control-probe data frame built by read_control_probes."""
        return self.__control_data_frame

    @property
    def snp_data_frame(self):
        """SNP ('rs') probe data frame built by read_snp_probes."""
        return self.__snp_data_frame

    @property
    def mouse_data_frame(self):
        """Mouse 'Multi'/'Random' probe data frame; empty for non-mouse arrays."""
        return self.__mouse_data_frame

    @staticmethod
    def download_default(array_type, on_lambda=False):
        """Downloads the appropriate manifest file if one does not already exist.

        Arguments:
            array_type {ArrayType} -- The type of array to process.

        Keyword Arguments:
            on_lambda {bool} -- look in (and download to) the /tmp manifest folder
                instead of the one in the user's home directory (default: {False})

        Returns:
            [PurePath] -- Path to the manifest file.

        Raises:
            KeyError: array_type has no entry in ARRAY_TYPE_MANIFEST_FILENAMES.
        """
        dir_path = Path(MANIFEST_DIR_PATH_LAMBDA if on_lambda else MANIFEST_DIR_PATH).expanduser()
        filename = ARRAY_TYPE_MANIFEST_FILENAMES[array_type]
        filepath = dir_path.joinpath(filename)

        if filepath.exists():
            return filepath

        LOGGER.info(f"Downloading manifest: {Path(filename).stem}")
        src_url = urljoin(MANIFEST_REMOTE_PATH, filename)
        # presumably saves `filename` into dir_path -- see utils.download_file
        download_file(filename, src_url, dir_path)

        return filepath

    @staticmethod
    def seek_to_start(manifest_file):
        """ find the start of the data part of the manifest. first left-most column must be "IlmnID" to be found.

        The handle must yield bytes (lines are compared against b'IlmnID').

        Raises:
            EOFError: the end of the file is reached without finding a header line.
        """
        reset_file(manifest_file)

        while True:
            line_start = manifest_file.tell()
            line = manifest_file.readline()
            if line.startswith(b'IlmnID'):
                break
            if not line: #EOF
                raise EOFError("The first (left-most) column in your manifest must contain 'IlmnID'. This defines the header row.")

        if line_start == 0:
            manifest_file.seek(line_start)
        else:
            # Steps back one extra byte, i.e. onto the last byte of the preceding line.
            # NOTE(review): presumably the parser then sees a blank line before the header;
            # the skiprows arithmetic in read_control_probes may depend on it -- confirm before changing.
            manifest_file.seek(line_start - 1)

    @staticmethod
    def _manifest_label(manifest_file):
        """Short name of the manifest for log messages; handles without a `name` give '<buffer>'."""
        return Path(str(getattr(manifest_file, 'name', '<buffer>'))).stem

    def read_probes(self, manifest_file):
        """Read the probe section of the manifest into a data frame indexed by IlmnID.

        All of `self.columns` are requested first. If pandas raises ValueError (e.g. the
        optional OLD_* genome mapping columns are missing), the read is repeated without
        those optional columns. A 'probe_type' column is added from
        ProbeType.from_manifest_values(IlmnID, Infinium_Design_Type).

        Arguments:
            manifest_file {file-like} -- open manifest handle yielding bytes.

        Returns:
            [DataFrame] -- up to `array_type.num_probes` rows.
        """
        if self.verbose:
            LOGGER.info(f'Reading manifest file: {self._manifest_label(manifest_file)}')

        data_types = self.get_data_types()

        def load(use_columns):
            # Rewind before every attempt: a failed read_csv has already consumed part
            # (or all) of the stream, so a retry must not continue from where it stopped.
            self.seek_to_start(manifest_file)
            return pd.read_csv(
                manifest_file,
                comment='[',
                dtype={col: data_types[col] for col in use_columns},
                usecols=use_columns,
                nrows=self.array_type.num_probes,
                index_col='IlmnID',
            )

        try:
            data_frame = load(self.columns)
        except ValueError:
            optional = ('OLD_CHR', 'OLD_Strand', 'OLD_Genome_Build', 'OLD_MAPINFO')
            data_frame = load([col for col in self.columns if col not in optional])
            LOGGER.info(f"Some optional genome mapping columns were not found in {self._manifest_label(manifest_file)}")
        # AddressB_ID in manifest includes NaNs and INTs; int or float both work for manifests.

        # ProbeType.from_manifest_values() returns the probe type (I, II, SnpI, SnpII, Control)
        # using either the Infinium_Design_Type (I or II) or the name (starts with 'rs' == SnpI);
        # 'Control' is none of the above. A plain comprehension also works for zero rows.
        data_frame['probe_type'] = [
            ProbeType.from_manifest_values(name, infinium_type).value
            for name, infinium_type in zip(
                data_frame.index.values,
                data_frame['Infinium_Design_Type'].values,
            )
        ]
        return data_frame

    def read_control_probes(self, manifest_file):
        """ Unlike other probes, control probes have no IlmnID because they're not locus-specific.
        they also use arbitrary columns, ignoring the header at start of manifest file.

        Returns:
            [DataFrame] -- up to `array_type.num_controls` rows, named per CONTROL_COLUMNS
            and indexed by the first of them.
        """
        self.seek_to_start(manifest_file)

        return pd.read_csv(
            manifest_file,
            comment='[',
            header=None,
            index_col=0, # illumina_id, not IlmnID here
            names=CONTROL_COLUMNS, # this gives these columns new names, because they have none. loading stuff at end of CSV after probes end.
            nrows=self.array_type.num_controls,
            skiprows=self.array_type.num_probes +1, #without the +1, it includes the last cpg probe in controls and breaks stuff.
            usecols=range(len(CONTROL_COLUMNS)),
        )

    def read_snp_probes(self, manifest_file):
        """ Unlike cpg and control probes, these rs probes are NOT sequential in all arrays.

        Returns:
            [DataFrame] -- rows whose IlmnID starts with 'rs', with float64 address columns.
        """
        self.seek_to_start(manifest_file)
        # since these are not sequential, loading everything and filtering by IlmnID.
        all_rows = pd.read_csv(
            manifest_file,
            low_memory=False)
        is_snp = all_rows['IlmnID'].str.match('rs', na=False)
        # 'O' type columns won't match in SigSet, so forcing float64 here. Also, float32 won't cover all probe IDs; must be float64.
        return all_rows[is_snp].astype({'AddressA_ID':'float64', 'AddressB_ID':'float64'})

    def read_mouse_probes(self, manifest_file):
        """ ILLUMINA_MOUSE contains unique probes whose names begin with 'mu' and 'rp'
        for 'murine' and 'repeat', respectively. This creates a dataframe of these probes,
        which are not processed like normal cg/ch probes.

        Returns:
            [DataFrame] -- rows whose 'design' column is 'Multi' or 'Random'.
        """
        self.seek_to_start(manifest_file)
        all_rows = pd.read_csv(
            manifest_file,
            low_memory=False) # low_memory=False is required because control probes create mixed-types in columns.
        # pre v1.4.6 this filtered on Probe_Type ('rp' / 'mu') and on IlmnID starting with 'uk'.
        keep = (all_rows['design'] == 'Multi') | (all_rows['design'] == 'Random')
        return all_rows[keep]

    # NOTE: the map_to_genome() / get_genome_data() helpers that used to sit here were
    # already disabled (kept only inside a string literal marked
    # "NEVER CALLED ANYWHERE - belongs in methylize") and have been dropped.

    def get_data_types(self):
        """Return the dtype mapping for `self.columns`: str everywhere, 'Int64' for the two address columns."""
        data_types = {
            key: str for key in self.columns
        }
        data_types['AddressA_ID'] = 'Int64' #'float64' -- dtype found only in pandas 0.24 or greater
        data_types['AddressB_ID'] = 'Int64' #'float64'
        return data_types

    def get_probe_details(self, probe_type, channel=None):
        """used by infer_channel_switch. Given a probe type (I, II, SnpI, SnpII, Control) and a channel (Channel.RED | Channel.GREEN),
        this will return info needed to map probes to their names (e.g. cg0031313 or rs00542420), which are NOT in the idat files.

        Arguments:
            probe_type {ProbeType} -- rows are kept where 'probe_type' equals probe_type.value.

        Keyword Arguments:
            channel {Channel} -- when given, rows must also have 'Color_Channel'
                equal to channel.value (default: {None})

        Returns:
            [DataFrame] -- the matching rows of `self.data_frame`.

        Raises:
            TypeError: probe_type is not a ProbeType, or channel is given and is not a Channel.
        """
        if not isinstance(probe_type, ProbeType):
            raise TypeError('probe_type is not a valid ProbeType')

        if channel and not isinstance(channel, Channel):
            raise TypeError('channel not a valid Channel')

        data_frame = self.data_frame
        probe_type_mask = data_frame['probe_type'].values == probe_type.value

        if not channel:
            return data_frame[probe_type_mask]

        channel_mask = data_frame['Color_Channel'].values == channel.value
        return data_frame[probe_type_mask & channel_mask]